2016-10-27 12:06:44 +02:00
|
|
|
import re
|
|
|
|
import logging
|
|
|
|
import traceback
|
2018-05-10 19:13:36 +02:00
|
|
|
from typing import Any, Optional, Dict
|
2016-10-27 12:06:44 +02:00
|
|
|
from typing.re import Match
|
|
|
|
import requests
|
|
|
|
from zerver.lib.cache import cache_with_key, get_cache_with_key
|
|
|
|
from zerver.lib.url_preview.oembed import get_oembed_data
|
|
|
|
from zerver.lib.url_preview.parsers import OpenGraphParser, GenericParser
|
2017-06-16 00:23:35 +02:00
|
|
|
from django.utils.encoding import smart_text
|
2016-10-27 12:06:44 +02:00
|
|
|
|
|
|
|
|
|
|
|
# Cache name handed to the caching decorators used in this module
# (passed as their ``cache_name`` argument).
CACHE_NAME = "database"

# Based on django.core.validators.URLValidator, with ftp support removed.
# The pattern is anchored at both ends and compiled case-insensitively, so
# the upper-case character classes below also accept lower-case input.
link_regex = re.compile(
    "".join([
        # Scheme: http:// or https://
        r'^(?:http)s?://',
        # Host, first alternative: one or more dot-terminated labels
        # followed by a final label (optionally with a trailing dot).
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|',
        # Host, second alternative: dotted quad of 1-3 digit groups
        # (the numeric range of each group is not checked).
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',
        # Optional port.
        r'(?::\d+)?',
        # Optional path/query: nothing, a lone "/", or "/" or "?"
        # followed by non-whitespace up to the end of the string.
        r'(?:/?|[/?]\S+)$',
    ]),
    flags=re.IGNORECASE,
)
|
|
|
|
|
|
|
|
|
2018-05-10 19:13:36 +02:00
|
|
|
def is_link(url: str) -> Optional[Match[str]]:
    """Check whether ``url`` looks like an http(s) link.

    The value is first passed through ``smart_text`` and then matched
    against ``link_regex``.

    Returns the regex match object when the whole string matches, or
    ``None`` when it does not. Callers can therefore use the result
    directly as a truth value.
    """
    # Pattern.match() yields None on failure, hence the Optional return type.
    return link_regex.match(smart_text(url))
|
2016-10-27 12:06:44 +02:00
|
|
|
|
|
|
|
|
2018-05-10 19:13:36 +02:00
|
|
|
def cache_key_func(url: str) -> str:
    """Return the cache key for ``url``.

    The key is the URL string itself, returned unchanged. This function is
    passed as the key function to the caching decorators in this module.
    """
    cache_key = url
    return cache_key
|
|
|
|
|
|
|
|
|
|
|
|
@cache_with_key(cache_key_func, cache_name=CACHE_NAME, with_statsd_key="urlpreview_data")
def get_link_embed_data(url: str,
                        maxwidth: Optional[int]=640,
                        maxheight: Optional[int]=480) -> Optional[Dict[Any, Any]]:
    """Collect preview metadata for ``url``.

    Three sources are consulted, in this order:

    1. OEmbed (``get_oembed_data``), which receives ``maxwidth`` and
       ``maxheight``.
    2. Open Graph tags parsed from the page body; these values are merged
       over the OEmbed data.
    3. Generic meta tags parsed from the page body; these only fill in
       ``title``, ``description`` and ``image`` when still missing or empty.

    Returns ``None`` when ``url`` does not pass ``is_link`` or when the
    OEmbed request raises a ``requests`` exception. Otherwise returns a
    dict, which may be empty. If fetching the page itself fails with a
    ``requests`` exception, the error is logged and the data gathered so
    far is returned, the same as for a non-OK HTTP response.
    """
    if not is_link(url):
        return None

    fetch_error_msg = 'Unable to fetch information from url {0}, traceback: {1}'

    # Source 1: OEmbed.
    try:
        data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
    except requests.exceptions.RequestException:
        logging.error(fetch_error_msg.format(url, traceback.format_exc()))
        return None
    data = data or {}

    # Fetch the page itself for sources 2 and 3. Network failures here are
    # handled like the OEmbed failure above instead of escaping to the
    # caller, but whatever was already collected is kept.
    # NOTE(review): no timeout is passed to requests.get, so a stalled
    # server can block this call indefinitely -- consider adding one.
    try:
        response = requests.get(url)
    except requests.exceptions.RequestException:
        logging.error(fetch_error_msg.format(url, traceback.format_exc()))
        return data

    if response.ok:
        # Source 2: Open Graph values override OEmbed values.
        og_data = OpenGraphParser(response.text).extract_data()
        if og_data:
            data.update(og_data)

        # Source 3: generic meta tags only fill remaining gaps.
        generic_data = GenericParser(response.text).extract_data() or {}
        for key in ['title', 'description', 'image']:
            if not data.get(key) and generic_data.get(key):
                data[key] = generic_data[key]
    return data
|
|
|
|
|
|
|
|
|
|
|
|
@get_cache_with_key(cache_key_func, cache_name=CACHE_NAME)
def link_embed_data_from_cache(
        url: str,
        maxwidth: Optional[int]=640,
        maxheight: Optional[int]=480,
) -> Any:
    """Cache-lookup counterpart of ``get_link_embed_data``.

    The body deliberately does no work and returns ``None``. The value
    callers receive is determined by the ``get_cache_with_key`` decorator,
    which is given the same key function and cache name as
    ``get_link_embed_data``.
    NOTE(review): presumably the decorator returns the cached entry for
    ``url`` -- confirm against zerver.lib.cache.
    """
    return None
|