import re
from typing import Any, Callable, Match, Optional
from urllib.parse import urljoin

import magic
import requests
from django.conf import settings
from django.utils.encoding import smart_str

from version import ZULIP_VERSION
from zerver.lib.cache import cache_with_key, preview_url_cache_key
from zerver.lib.outgoing_http import OutgoingSession
from zerver.lib.pysa import mark_sanitized
from zerver.lib.url_preview.oembed import get_oembed_data
from zerver.lib.url_preview.parsers import GenericParser, OpenGraphParser
from zerver.lib.url_preview.types import UrlEmbedData, UrlOEmbedData

# Use an in-memory cache for development, to make it easy to develop
# this code.
#
# In production, link preview data is cached in memcached (the
# "default" cache), not in the database: database accesses are slow
# compared to memcached, and we wish to avoid the overhead of a
# database connection in the `embed_links` worker.  Making that
# connection may not be thread-safe -- and in low-memory (and Docker)
# configurations, all workers run as separate threads in a single
# process, which can lead to stalled database connections and failed
# previews.  Since the cache no longer needs to be durable, the cost
# is minimal: at worst, posting the same URL twice, on either side of
# an upgrade, results in two preview fetches.
CACHE_NAME = "default" if not settings.DEVELOPMENT else "in-memory"
# Based on django.core.validators.URLValidator, with ftp support removed.
link_regex = re.compile(
    r"^(?:http)s?://"  # http:// or https://
    r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"  # domain...
    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"  # ...or IP
    r"(?::\d+)?"  # optional port
    r"(?:/?|[/?]\S+)$",
    re.IGNORECASE,
)
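
# Illustrative examples (not exercised in this module):
#   link_regex.match("https://example.com/some/page?q=1")  # matches
#   link_regex.match("ftp://example.com/file")  # None; ftp support was removed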

# Use a browser-style (Mozilla/5.0-compatible) User-Agent that
# identifies Zulip, since some sites refuse to serve requests from
# old or unrecognized browsers.
ZULIP_URL_PREVIEW_USER_AGENT = (
    "Mozilla/5.0 (compatible; ZulipURLPreview/{version}; +{external_host})"
).format(version=ZULIP_VERSION, external_host=settings.ROOT_DOMAIN_URI)

# FIXME: pyoembed does not use this header or timeout when trying to autodiscover!
HEADERS = {"User-Agent": ZULIP_URL_PREVIEW_USER_AGENT}
TIMEOUT = 15  # seconds


class PreviewSession(OutgoingSession):
    def __init__(self) -> None:
        super().__init__(role="preview", timeout=TIMEOUT, headers=HEADERS)
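
# Every request sent through a PreviewSession carries HEADERS and the
# 15-second TIMEOUT configured above; see valid_content_type below for
# a usage example.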


def is_link(url: str) -> Optional[Match[str]]:
    return link_regex.match(smart_str(url))
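
# is_link returns the Match object itself rather than a bool, so callers
# can use it directly in a boolean context (as get_link_embed_data does
# below).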


def guess_mimetype_from_content(response: requests.Response) -> str:
    mime_magic = magic.Magic(mime=True)
    try:
        # Sniff the MIME type from the first chunk of the streamed body.
        content = next(response.iter_content(1000))
    except StopIteration:
        # The response body was empty; libmagic expects bytes, not str.
        content = b""
    return mime_magic.from_buffer(content)
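
# For example (illustrative), a body starting with b"<!DOCTYPE html>" is
# typically sniffed as "text/html", even when the server sends no
# Content-Type header at all.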


def valid_content_type(url: str) -> bool:
    try:
        response = PreviewSession().get(url, stream=True)
    except requests.RequestException:
        return False

    if not response.ok:
        return False

    content_type = response.headers.get("content-type")
    # Be accommodating of bad servers: assume the content may be HTML
    # if no Content-Type header is present.
    if not content_type or content_type.startswith("text/html"):
        # Verify that the content is actually HTML if the server claims it is.
        content_type = guess_mimetype_from_content(response)
    return content_type.startswith("text/html")
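
# Note that this issues its own GET of the URL; get_link_embed_data
# below fetches the page a second time for parsing.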


def catch_network_errors(func: Callable[..., Any]) -> Callable[..., Any]:
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        try:
            return func(*args, **kwargs)
        except requests.exceptions.RequestException:
            # Swallow network errors; the decorated function returns None.
            pass

    return wrapper


# catch_network_errors is applied outermost, so a network failure while
# fetching returns None rather than raising, and such failures are not
# cached; successful results are cached under preview_url_cache_key.
@catch_network_errors
@cache_with_key(preview_url_cache_key, cache_name=CACHE_NAME, with_statsd_key="urlpreview_data")
def get_link_embed_data(
    url: str, maxwidth: int = 640, maxheight: int = 480
) -> Optional[UrlEmbedData]:
    if not is_link(url):
        return None

    if not valid_content_type(url):
        return None

    # The oembed data from pyoembed may be complete enough to return
    # as-is; if so, we use it.  Otherwise, we use it as a _base_ for
    # the other, less sophisticated techniques, which we apply as
    # successive fallbacks.
    data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    if data is not None and isinstance(data, UrlOEmbedData):
        return data

    response = PreviewSession().get(mark_sanitized(url), stream=True)
    if not response.ok:
        return None

    if data is None:
        data = UrlEmbedData()

    for parser_class in (OpenGraphParser, GenericParser):
        parser = parser_class(response.content, response.headers.get("Content-Type"))
        data.merge(parser.extract_data())

    # Resolve any relative image URL against the final response URL.
    if data.image:
        data.image = urljoin(response.url, data.image)
    return data
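
# Illustrative usage (hypothetical call site):
#   data = get_link_embed_data("https://example.com/article")
#   if data is not None:
#       ...  # preview fields merged from oEmbed, Open Graph, and generic HTML parsing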