2016-10-27 12:06:44 +02:00
|
|
|
import re
|
2022-04-14 21:52:41 +02:00
|
|
|
from typing import Any, Callable, Match, Optional
|
2021-10-21 07:04:57 +02:00
|
|
|
from urllib.parse import urljoin
|
2019-05-02 17:13:20 +02:00
|
|
|
|
2020-06-11 00:54:34 +02:00
|
|
|
import magic
|
|
|
|
import requests
|
2019-05-02 17:14:08 +02:00
|
|
|
from django.conf import settings
|
2021-04-16 01:03:00 +02:00
|
|
|
from django.utils.encoding import smart_str
|
2019-05-02 17:13:20 +02:00
|
|
|
|
2019-05-05 20:56:54 +02:00
|
|
|
from version import ZULIP_VERSION
|
2022-04-14 21:57:20 +02:00
|
|
|
from zerver.lib.cache import cache_with_key, preview_url_cache_key
|
2021-05-07 03:54:25 +02:00
|
|
|
from zerver.lib.outgoing_http import OutgoingSession
|
2019-12-20 00:00:45 +01:00
|
|
|
from zerver.lib.pysa import mark_sanitized
|
2016-10-27 12:06:44 +02:00
|
|
|
from zerver.lib.url_preview.oembed import get_oembed_data
|
2020-06-11 00:54:34 +02:00
|
|
|
from zerver.lib.url_preview.parsers import GenericParser, OpenGraphParser
|
2022-04-14 21:52:41 +02:00
|
|
|
from zerver.lib.url_preview.types import UrlEmbedData, UrlOEmbedData
|
2016-10-27 12:06:44 +02:00
|
|
|
|
|
|
|
# Pattern for the URLs we are willing to preview.  Derived from
# django.core.validators.URLValidator, with ftp support removed.
link_regex = re.compile(
    # Scheme: http:// or https://
    r"^(?:http)s?://"
    # Host, first alternative: a dotted domain name...
    r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|"
    # ...second alternative: a dotted-quad IP address.
    r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"
    # Optional port.
    r"(?::\d+)?"
    # Optional path/query; must not contain whitespace.
    r"(?:/?|[/?]\S+)$",
    flags=re.IGNORECASE,
)
|
2020-04-23 21:26:31 +02:00
|
|
|
|
|
|
|
# Send a browser-like "Mozilla/5.0 (compatible; ...)" User-Agent, since some
# sites refuse to work on old browsers.  The string also carries the Zulip
# version and this server's root URI.
ZULIP_URL_PREVIEW_USER_AGENT = (
    "Mozilla/5.0 (compatible; ZulipURLPreview/{version}; +{external_host})".format(
        external_host=settings.ROOT_DOMAIN_URI,
        version=ZULIP_VERSION,
    )
)

# Defaults handed to PreviewSession for every preview request.
# FIXME: This header and timeout are not used by pyoembed, when trying to autodiscover!
TIMEOUT = 15
HEADERS = {"User-Agent": ZULIP_URL_PREVIEW_USER_AGENT}
|
2016-10-27 12:06:44 +02:00
|
|
|
|
|
|
|
|
2021-05-07 03:54:25 +02:00
|
|
|
class PreviewSession(OutgoingSession):
    """OutgoingSession preconfigured for fetching link previews.

    Uses the "preview" role together with this module's HEADERS and
    TIMEOUT defaults.
    """

    def __init__(self) -> None:
        super().__init__(
            headers=HEADERS,
            timeout=TIMEOUT,
            role="preview",
        )
|
|
|
|
|
|
|
|
|
2021-03-18 02:11:28 +01:00
|
|
|
def is_link(url: str) -> Optional[Match[str]]:
    """Match ``url`` against ``link_regex``.

    Returns the match object when ``url`` is an http(s) URL accepted by
    the pattern, and ``None`` otherwise.
    """
    candidate = smart_str(url)
    return link_regex.match(candidate)
|
2016-10-27 12:06:44 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2019-05-04 17:54:18 +02:00
|
|
|
def guess_mimetype_from_content(response: requests.Response) -> str:
    """Guess the MIME type of ``response`` by sniffing the start of its body.

    Only the first chunk yielded by ``response.iter_content(1000)`` is
    inspected; when the body yields no chunks at all, an empty buffer is
    sniffed instead.
    """
    sniffer = magic.Magic(mime=True)
    # next() with a default stands in for catching StopIteration on an
    # empty body.
    first_chunk = next(response.iter_content(1000), "")
    return sniffer.from_buffer(first_chunk)
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2019-05-04 17:54:18 +02:00
|
|
|
def valid_content_type(url: str) -> bool:
    """Return whether ``url`` appears to serve an HTML document.

    The URL is fetched with a streaming request.  The result is ``False``
    when the request raises ``requests.RequestException`` or the response
    status is not OK.  Otherwise the ``content-type`` header decides; when
    it is missing or claims ``text/html``, the start of the body is sniffed
    with ``guess_mimetype_from_content`` and that guess is used instead.
    """
    try:
        response = PreviewSession().get(url, stream=True)
    except requests.RequestException:
        return False

    # With stream=True the body is not downloaded up front, so the
    # connection stays checked out until the response is closed.  Close it
    # on every exit path instead of waiting for garbage collection.
    with response:
        if not response.ok:
            return False

        content_type = response.headers.get("content-type")
        # Be accommodating of bad servers: assume content may be html if no content-type header
        if not content_type or content_type.startswith("text/html"):
            # Verify that the content is actually HTML if the server claims it is
            content_type = guess_mimetype_from_content(response)
        return content_type.startswith("text/html")
|
2019-05-04 17:54:18 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2019-05-10 14:29:33 +02:00
|
|
|
def catch_network_errors(func: Callable[..., Any]) -> Callable[..., Any]:
    """Decorate ``func`` so that network failures yield ``None``.

    The returned wrapper forwards all positional and keyword arguments to
    ``func`` and returns its result.  If the call raises
    ``requests.exceptions.RequestException``, the exception is swallowed
    and ``None`` is returned; any other exception propagates unchanged.
    """
    # Imported locally so this block does not depend on a module-level import.
    import functools

    # Preserve func's __name__, __doc__, etc. on the wrapper.
    @functools.wraps(func)
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        try:
            return func(*args, **kwargs)
        except requests.exceptions.RequestException:
            # Deliberately best-effort: a network failure is reported to
            # the caller as "no result" rather than as an error.
            return None

    return wrapper
|
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
2019-05-10 14:29:33 +02:00
|
|
|
@catch_network_errors
@cache_with_key(preview_url_cache_key)
def get_link_embed_data(
    url: str, maxwidth: int = 640, maxheight: int = 480
) -> Optional[UrlEmbedData]:
    """Collect preview (embed) data for ``url``.

    Returns ``None`` when ``url`` is not accepted by ``is_link``, when
    ``valid_content_type`` rejects it, or when fetching the page gives a
    non-OK response.  Because of the ``catch_network_errors`` decorator, a
    ``requests.exceptions.RequestException`` raised along the way also
    results in ``None``.

    ``maxwidth`` and ``maxheight`` are forwarded to ``get_oembed_data``.
    """
    if not is_link(url):
        return None

    if not valid_content_type(url):
        return None

    # The oembed data from pyoembed may be complete enough to return
    # as-is; if so, we use it.  Otherwise, we use it as a _base_ for
    # the other, less sophisticated techniques which we apply as
    # successive fallbacks.
    data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    # isinstance() is already False for None, so no separate None check.
    if isinstance(data, UrlOEmbedData):
        return data

    # The request is streamed, so close the response explicitly; otherwise
    # the early return on a non-OK status leaves the body unread and the
    # connection held open.
    with PreviewSession().get(mark_sanitized(url), stream=True) as response:
        if not response.ok:
            return None

        if data is None:
            data = UrlEmbedData()

        content_type = response.headers.get("Content-Type")
        # Merge OpenGraph data first, then the generic parser's data.
        for parser_class in (OpenGraphParser, GenericParser):
            parser = parser_class(response.content, content_type)
            data.merge(parser.extract_data())

        if data.image:
            # Resolve a possibly-relative image URL against the final
            # response URL.
            data.image = urljoin(response.url, data.image)

    return data
|