Add oEmbed/Open Graph/meta tag data retrieval for inline links.

This change adds support for displaying inline Open Graph previews for
links posted in Zulip messages.

It is designed to interact correctly with message editing.

This adds the new settings.INLINE_URL_EMBED_PREVIEW setting to control
whether this feature is enabled.

By default, this setting is currently disabled, so that we can burn it
in for a bit before it impacts users more broadly.

Eventually, we may want to make this manageable via a (set of?)
per-realm settings.  E.g. I can imagine a realm wanting to be able to
enable/disable it for certain URLs.
This commit is contained in:
Igor Tokarev 2016-10-27 15:06:44 +05:00 committed by Tim Abbott
parent b68fef8933
commit c93f1d4eda
23 changed files with 464 additions and 14 deletions

View File

@ -240,6 +240,20 @@ stdout_logfile_maxbytes=1GB ; max # logfile bytes b4 rotation (default 50MB)
stdout_logfile_backups=10 ; # of stdout logfile backups (default 10)
directory=/home/zulip/deployments/current/
[program:zulip-events-embed_links]
command=/home/zulip/deployments/current/manage.py process_queue --queue_name=embed_links
priority=600 ; the relative start priority (default 999)
autostart=true ; start at supervisord start (default: true)
autorestart=true ; whether/when to restart (default: unexpected)
stopsignal=TERM ; signal used to kill process (default TERM)
stopwaitsecs=30 ; max num secs to wait b4 SIGKILL (default 10)
user=zulip ; setuid to this UNIX account to run the program
redirect_stderr=true ; redirect proc stderr to stdout (default false)
stdout_logfile=/var/log/zulip/events-embed_links.log ; stdout log path, NONE for none; default AUTO
stdout_logfile_maxbytes=1GB ; max # logfile bytes b4 rotation (default 50MB)
stdout_logfile_backups=10 ; # of stdout logfile backups (default 10)
directory=/home/zulip/deployments/current/
[program:zulip-deliver-enqueued-emails]
command=/home/zulip/deployments/current/manage.py deliver_email
priority=600 ; the relative start priority (default 999)
@ -261,7 +275,7 @@ directory=/home/zulip/deployments/current/
[group:zulip-workers]
; each refers to 'x' in [program:x] definitions
programs=zulip-events-user-activity,zulip-events-user-activity-interval,zulip-events-user-presence,zulip-events-signups,zulip-events-confirmation-emails,zulip-events-missedmessage_reminders,zulip-events-slowqueries,zulip-events-feedback_messages,zulip-events-digest_emails,zulip-events-error_reports,zulip-deliver-enqueued-emails,zulip-events-missedmessage_mobile_notifications,zulip-events-email_mirror
programs=zulip-events-user-activity,zulip-events-user-activity-interval,zulip-events-user-presence,zulip-events-signups,zulip-events-confirmation-emails,zulip-events-missedmessage_reminders,zulip-events-slowqueries,zulip-events-feedback_messages,zulip-events-digest_emails,zulip-events-error_reports,zulip-deliver-enqueued-emails,zulip-events-missedmessage_mobile_notifications,zulip-events-email_mirror,zulip-events-embed_links
[group:zulip-senders]
programs=zulip-events-message_sender

View File

@ -171,3 +171,7 @@ ijson==2.3
#for pep8 linter
pycodestyle==2.1.0
# Needed for link preview
beautifulsoup4==4.5.1
git+https://github.com/rafaelmartins/pyoembed.git@eb9901917c2a44b49e2887c077ead84a722c50dc#egg=pyoembed

View File

@ -2483,3 +2483,17 @@ button.topic_edit_cancel {
.add-user-list-filter {
width: 80%;
}
/* embed */
.message_embed {
border-left: 4px solid #ccc;
padding-left: 5px;
display: block;
margin-bottom: 10px;
}
.message_embed_title {
font-weight: bold;
}
.message_embed_image {
max-width: 80px;
}

View File

@ -1,2 +1,2 @@
ZULIP_VERSION = "1.4.1+git"
PROVISION_VERSION = '2.4'
PROVISION_VERSION = '2.5'

View File

@ -802,6 +802,7 @@ def do_send_messages(messages):
message['active_recipients'] = [user_profile for user_profile in message['recipients']
if user_profile.is_active]
links_for_embed = set() # type: Set[text_type]
# Render our messages.
for message in messages:
assert message['message'].rendered_content is None
@ -811,6 +812,7 @@ def do_send_messages(messages):
message_users=message['active_recipients'])
message['message'].rendered_content = rendered_content
message['message'].rendered_content_version = bugdown_version
links_for_embed |= message['message'].links_for_preview
for message in messages:
message['message'].update_calculated_fields()
@ -843,6 +845,7 @@ def do_send_messages(messages):
um.flags |= UserMessage.flags.has_alert_word
if is_me_message:
um.flags |= UserMessage.flags.is_me_message
user_message_flags[message['message'].id][um.user_profile_id] = um.flags_list()
ums.extend(ums_to_create)
UserMessage.objects.bulk_create(ums)
@ -892,6 +895,14 @@ def do_send_messages(messages):
if message['sender_queue_id'] is not None:
event['sender_queue_id'] = message['sender_queue_id']
send_event(event, users)
if settings.INLINE_URL_EMBED_PREVIEW and links_for_embed:
event_data = {
'message_id': message['message'].id,
'message_content': message['message'].content,
'urls': links_for_embed}
queue_json_publish('embed_links', event_data, lambda x: None)
if (settings.ENABLE_FEEDBACK and
message['message'].recipient.type == Recipient.PERSONAL and
settings.FEEDBACK_BOT in [up.email for up in message['recipients']]):
@ -2640,6 +2651,39 @@ def update_to_dict_cache(changed_messages):
cache_set_many(items_for_remote_cache)
return message_ids
# We use transaction.atomic to support select_for_update in the attachment codepath.
@transaction.atomic
def do_update_embedded_data(user_profile, message, content, rendered_content):
    # type: (UserProfile, Message, Optional[text_type], Optional[text_type]) -> None
    """Store freshly rendered content on `message` and notify its recipients.

    An 'update_message' event is built with `user_profile.email` as the
    sender.  When `content` is not None, the per-user flags are refreshed
    via update_user_message_flags and the message's content, rendered
    content and rendered-content version are replaced.  The event is then
    logged, the message saved, the to_dict cache refreshed, and the event
    sent to every user that has a UserMessage row for this message.
    """
    event = {
        'type': 'update_message',
        'sender': user_profile.email,
        'message_id': message.id}  # type: Dict[str, Any]
    changed_messages = [message]

    ums = UserMessage.objects.filter(message=message.id)

    if content is not None:
        update_user_message_flags(message, ums)
        message.content = content
        message.rendered_content = rendered_content
        message.rendered_content_version = bugdown_version
        event["content"] = content
        event["rendered_content"] = rendered_content

    log_event(event)
    # rendered_content_version is assigned above, so it has to be listed
    # here as well; otherwise the new version is never written to the database.
    message.save(update_fields=["content", "rendered_content",
                                "rendered_content_version"])

    event['message_ids'] = update_to_dict_cache(changed_messages)

    def user_info(um):
        # type: (UserMessage) -> Dict[str, Any]
        return {
            'id': um.user_profile_id,
            'flags': um.flags_list()
        }
    send_event(event, [user_info(um) for um in ums])
# We use transaction.atomic to support select_for_update in the attachment codepath.
@transaction.atomic
def do_update_message(user_profile, message, subject, propagate_mode, content, rendered_content):

View File

@ -33,7 +33,9 @@ from zerver.lib.bugdown import fenced_code
from zerver.lib.bugdown.fenced_code import FENCE_RE
from zerver.lib.camo import get_camo_url
from zerver.lib.timeout import timeout, TimeoutExpired
from zerver.lib.cache import cache_with_key, cache_get_many, cache_set_many
from zerver.lib.cache import (
cache_with_key, cache_get_many, cache_set_many, NotFoundInCache)
from zerver.lib.url_preview import preview as link_preview
from zerver.models import Message
import zerver.lib.alert_words as alert_words
import zerver.lib.mention as mention
@ -124,6 +126,35 @@ def add_a(root, url, link, height="", title=None, desc=None,
desc_div = markdown.util.etree.SubElement(summary_div, "desc")
desc_div.set("class", "message_inline_image_desc")
def add_embed(root, link, extracted_data):
    # type: (Element, text_type, Dict[text_type, Any]) -> None
    """Append a "message_embed" preview block for `link` under `root`.

    The optional 'title', 'description' and 'image' entries of
    `extracted_data` are rendered, in that order; an entry that is
    missing or falsy produces no element.
    """
    make_child = markdown.util.etree.SubElement

    container = make_child(root, "div")
    container.set("class", "message_embed")

    title = extracted_data.get('title')
    if title:
        # The title is rendered as a link back to the previewed URL.
        title_elm = make_child(container, "div")
        title_elm.set("class", "message_embed_title")
        anchor = make_child(title_elm, "a")
        for attr, value in (("href", link), ("target", "_blank"), ("title", title)):
            anchor.set(attr, value)
        anchor.text = title

    description = extracted_data.get('description')
    if description:
        description_elm = make_child(container, "div")
        description_elm.set("class", "message_embed_description")
        description_elm.text = description

    img_link = extracted_data.get('image')
    if img_link:
        image_elm = make_child(container, "img")
        image_elm.set("src", img_link)
        image_elm.set("class", "message_embed_image")
@cache_with_key(lambda tweet_id: tweet_id, cache_name="database", with_statsd_key="tweet_data")
def fetch_tweet_data(tweet_id):
# type: (Text) -> Optional[Dict[Text, Any]]
@ -577,6 +608,17 @@ class InlineInterestingLinkProcessor(markdown.treeprocessors.Treeprocessor):
add_a(root, youtube, url, None, None, None, "youtube-video message_inline_image", yt_id)
continue
if current_message is None or not settings.INLINE_URL_EMBED_PREVIEW:
continue
try:
extracted_data = link_preview.link_embed_data_from_cache(url)
except NotFoundInCache:
current_message.links_for_preview.add(url)
continue
if extracted_data:
add_embed(root, url, extracted_data)
class Avatar(markdown.inlinepatterns.Pattern):
def handleMatch(self, match):
# type: (Match[Text]) -> Optional[Element]

View File

@ -1,6 +1,7 @@
from __future__ import absolute_import
from __future__ import unicode_literals
from six import text_type
from typing import Any, Dict, Optional, Text
import ujson
@ -191,3 +192,10 @@ def twitter(tweet_id):
return ujson.loads(MEDIA_TWEET)
else:
return None
def fake_urlembed_data():
    # type: () -> Dict[text_type, text_type]
    """Return canned link-preview data: a title and a description, no image."""
    embed_data = {
        'title': 'Test',
        'description': 'Short description',
    }
    return embed_data

View File

@ -30,6 +30,10 @@ if False:
FuncT = TypeVar('FuncT', bound=Callable[..., Any])
class NotFoundInCache(Exception):
    """Signals that a cache lookup found no entry for the requested key."""
remote_cache_time_start = 0.0
remote_cache_total_time = 0.0
remote_cache_total_requests = 0
@ -111,6 +115,28 @@ def get_cache_backend(cache_name):
return djcache
return caches[cache_name]
def get_cache_with_key(keyfunc, cache_name=None):
    # type: (Any, Optional[str]) -> Any
    """
    Build a decorator that turns a function into a cache-only lookup.

    The decorated function's own body is never executed.  Calling it
    computes the key with `keyfunc(*args, **kwargs)`, reads that key from
    the cache and returns the stored value.  A stored value may itself be
    None, so a missing entry is reported by raising NotFoundInCache
    instead of returning None.
    """
    def decorator(func):
        # type: (Callable[..., Any]) -> (Callable[..., Any])
        @wraps(func)
        def func_with_caching(*args, **kwargs):
            # type: (*Any, **Any) -> Any
            cached = cache_get(keyfunc(*args, **kwargs), cache_name=cache_name)
            if cached is None:
                raise NotFoundInCache()
            # The payload is element 0 of the cached entry.
            return cached[0]

        return func_with_caching

    return decorator
def cache_with_key(keyfunc, cache_name=None, timeout=None, with_statsd_key=None):
# type: (Any, Optional[str], Optional[int], Optional[str]) -> Any
# This function can't be typed perfectly because returning a generic function

View File

@ -291,6 +291,7 @@ def render_markdown(message, content, domain=None, realm_alert_words=None, messa
message.is_me_message = False
message.mentions_user_ids = set()
message.alert_words = set()
message.links_for_preview = set()
if not domain:
domain = message.sender.realm.domain

View File

View File

@ -0,0 +1,15 @@
from __future__ import absolute_import
from typing import Optional, Any
from six import text_type
from pyoembed import oEmbed, PyOembedException
def get_oembed_data(url, maxwidth=640, maxheight=480):
    # type: (text_type, Optional[int], Optional[int]) -> Any
    """Fetch oEmbed data for `url`; return None if pyoembed raises PyOembedException.

    On success the value of 'thumbnail_url' (None when absent) is also
    stored under the 'image' key of the returned data.
    """
    try:
        embed = oEmbed(url, maxwidth=maxwidth, maxheight=maxheight)
    except PyOembedException:
        return None
    else:
        embed['image'] = embed.get('thumbnail_url')
        return embed

View File

@ -0,0 +1,4 @@
from zerver.lib.url_preview.parsers.open_graph import OpenGraphParser
from zerver.lib.url_preview.parsers.generic import GenericParser
__all__ = ['OpenGraphParser', 'GenericParser']

View File

@ -0,0 +1,14 @@
from __future__ import absolute_import
from typing import Any
from six import text_type
from bs4 import BeautifulSoup
class BaseParser(object):
    """Base class for HTML preview parsers; parses the page once on construction."""

    def __init__(self, html_source):
        # type: (text_type) -> None
        self._soup = BeautifulSoup(html_source)

    def extract_data(self):
        # type: () -> Any
        """Return the data extracted from the parsed page.

        Subclasses must override this method.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        # NotImplemented is the binary-operator sentinel, not an exception;
        # raising it fails with a TypeError on Python 3.
        raise NotImplementedError()

View File

@ -0,0 +1,51 @@
from __future__ import absolute_import
from typing import Any, Dict
from zerver.lib.url_preview.parsers.base import BaseParser
class GenericParser(BaseParser):
    """Parser that guesses preview data from ordinary HTML markup."""

    def extract_data(self):
        # type: () -> Dict
        """Return a dict with 'title', 'description' and 'image' (each may be None)."""
        return {
            'title': self._get_title(),
            'description': self._get_description(),
            'image': self._get_image()}

    def _get_title(self):
        # type: () -> Any
        """Return the <title> text, else the first <h1> text, else None."""
        soup = self._soup
        if soup.title and soup.title.text != '':
            return soup.title.text
        if soup.h1 and soup.h1.text != '':
            return soup.h1.text
        return None

    def _get_description(self):
        # type: () -> Any
        """Return a description for the page, or None.

        Tried in order: the <meta name="description"> content, the text of
        the first <p> following the first <h1>, the text of the first <p>
        in the document.
        """
        soup = self._soup
        meta_description = soup.find('meta', attrs={'name': 'description'})
        if meta_description:
            # .get() avoids a KeyError for a meta tag without a content attribute.
            content = meta_description.get('content')
            if content:
                return content
        first_h1 = soup.find('h1')
        if first_h1:
            first_p = first_h1.find_next('p')
            # Test the same value that is returned; .string is None for
            # paragraphs with nested markup, while .text is always a string.
            if first_p and first_p.text != '':
                return first_p.text
        first_p = soup.find('p')
        if first_p and first_p.text != '':
            return first_p.text
        return None

    def _get_image(self):
        # type: () -> Any
        """
        Return the src of the first <img> that is a later sibling of the
        first <h1>, or None.  Presumably that is the page's main image.
        """
        soup = self._soup
        first_h1 = soup.find('h1')
        if first_h1:
            first_image = first_h1.find_next_sibling('img')
            if first_image:
                # .get() avoids a KeyError for an <img> without a src attribute.
                src = first_image.get('src')
                if src:
                    return src
        return None

View File

@ -0,0 +1,16 @@
from __future__ import absolute_import
import re
from six import text_type
from typing import Dict
from .base import BaseParser
class OpenGraphParser(BaseParser):
    """Parser that collects Open Graph (<meta property="og:...">) data."""

    def extract_data(self):
        # type: () -> Dict[str, text_type]
        """Return a dict mapping each og: property name, prefix removed, to its content.

        Meta tags whose property does not start with 'og:' or that have no
        content attribute are ignored.
        """
        prefix = 'og:'
        content = {}
        for tag in self._soup.findAll('meta'):
            prop = tag.get('property', '')
            # Require the prefix at the start of the name: a substring test
            # would also accept names such as "blog:author".
            if not prop.startswith(prefix):
                continue
            # A tag without a content attribute carries no value to record.
            if not tag.has_attr('content'):
                continue
            content[prop[len(prefix):]] = tag['content']
        return content

View File

@ -0,0 +1,70 @@
from __future__ import absolute_import
import re
import logging
import traceback
from six import text_type
from typing import Any, Optional
from typing.re import Match
import requests
from django.conf import settings
from zerver.lib.cache import cache_with_key, get_cache_with_key
from zerver.lib.bugdown import testing_mocks
from zerver.lib.url_preview.oembed import get_oembed_data
from zerver.lib.url_preview.parsers import OpenGraphParser, GenericParser
CACHE_NAME = "database"
# Based on django.core.validators.URLValidator, with ftp support removed.
link_regex = re.compile(
    r'^(?:http)s?://'  # http:// or https://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def is_link(url):
    # type: (text_type) -> Optional[Match[text_type]]
    """Return the match object if `url` is an http(s) URL, else None."""
    # Match the text directly: coercing with str() raises
    # UnicodeEncodeError for non-ASCII URLs on Python 2.
    return link_regex.match(url)
def cache_key_func(url):
    # type: (text_type) -> text_type
    """Return the cache key for a URL's preview data: the URL itself."""
    cache_key = url
    return cache_key
@cache_with_key(cache_key_func, cache_name=CACHE_NAME, with_statsd_key="urlpreview_data")
def get_link_embed_data(url, maxwidth=640, maxheight=480):
    # type: (text_type, Optional[int], Optional[int]) -> Any
    """Fetch preview data (title, description, image, ...) for `url`.

    Returns None when `url` is not an http(s) link or when a network
    request fails; otherwise returns a dict, which may be empty.  Under
    settings.TEST_SUITE canned data is returned and no request is made.
    """
    if not is_link(url):
        return None

    if settings.TEST_SUITE:
        return testing_mocks.fake_urlembed_data()

    # Fetch information from the URL.
    # Three sources are used, in this order:
    # 1. OEmbed
    # 2. Open Graph
    # 3. Meta tags
    #
    # Both network calls sit inside the try block so that a failed page
    # fetch is logged like a failed oEmbed lookup rather than propagating
    # to the caller.
    try:
        data = get_oembed_data(url, maxwidth=maxwidth, maxheight=maxheight)
        # Bound the wait so an unresponsive host cannot block the caller forever.
        response = requests.get(url, timeout=10)
    except requests.exceptions.RequestException:
        msg = 'Unable to fetch information from url {0}, traceback: {1}'
        logging.error(msg.format(url, traceback.format_exc()))
        return None

    data = data or {}
    if response.ok:
        og_data = OpenGraphParser(response.text).extract_data()
        if og_data:
            data.update(og_data)
        # Generic page markup only fills in keys that are still empty.
        generic_data = GenericParser(response.text).extract_data() or {}
        for key in ['title', 'description', 'image']:
            if not data.get(key) and generic_data.get(key):
                data[key] = generic_data[key]
    return data
@get_cache_with_key(cache_key_func, cache_name=CACHE_NAME)
def link_embed_data_from_cache(url, maxwidth=640, maxheight=480):
    # type: (text_type, Optional[int], Optional[int]) -> Any
    """Look up the cached preview data for `url`.

    The get_cache_with_key decorator supplies the whole behaviour: it
    returns the cached value or raises NotFoundInCache.  This body is only
    a placeholder.
    """
    return None

View File

@ -202,6 +202,7 @@ class BugdownTest(TestCase):
print("Running Bugdown Linkify tests")
self.maxDiff = None # type: Optional[int]
with mock.patch('zerver.lib.url_preview.preview.link_embed_data_from_cache', return_value=None):
for inline_url, reference, url in linkify_tests:
try:
match = replaced(reference, url, phrase=inline_url)
@ -272,7 +273,6 @@ class BugdownTest(TestCase):
# type: () -> None
# Don't fail on bad dropbox links
msg = "https://zulip-test.dropbox.com/photos/cl/ROmr9K1XYtmpneM"
with mock.patch('zerver.lib.bugdown.fetch_open_graph_image', return_value=None):
converted = bugdown_convert(msg)
self.assertEqual(converted, '<p><a href="https://zulip-test.dropbox.com/photos/cl/ROmr9K1XYtmpneM" target="_blank" title="https://zulip-test.dropbox.com/photos/cl/ROmr9K1XYtmpneM">https://zulip-test.dropbox.com/photos/cl/ROmr9K1XYtmpneM</a></p>')

View File

@ -0,0 +1,86 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import print_function
import mock
import ujson
from typing import Any
from django.test import override_settings
from zerver.lib.test_classes import ZulipTestCase
from zerver.lib.url_preview.oembed import get_oembed_data
from zerver.lib.url_preview.parsers import (
OpenGraphParser, GenericParser)
@override_settings(INLINE_URL_EMBED_PREVIEW=True)
class OembedTestCase(ZulipTestCase):
    """Tests for get_oembed_data with pyoembed's requests.get patched out."""

    @mock.patch('pyoembed.requests.get')
    def test_present_provider(self, get):
        # type: (Any) -> None
        """A successful JSON response is returned as a dict with its title."""
        response_data = {
            'type': 'rich',
            'thumbnail_url': 'https://scontent.cdninstagram.com/t51.2885-15/n.jpg',
            'thumbnail_width': 640,
            'thumbnail_height': 426,
            'title': 'NASA',
            'html': '<p>test</p>',
            'version': '1.0',
            'width': 658,
            'height': None}
        response = mock.Mock()
        response.headers = {'content-type': 'application/json'}
        response.ok = True
        response.text = ujson.dumps(response_data)
        get.return_value = response

        data = get_oembed_data('http://instagram.com/p/BLtI2WdAymy')

        self.assertIsInstance(data, dict)
        self.assertIn('title', data)
        self.assertEqual(data['title'], response_data['title'])

    @mock.patch('pyoembed.requests.get')
    def test_error_request(self, get):
        # type: (Any) -> None
        """A non-ok response yields None."""
        response = mock.Mock()
        response.ok = False
        get.return_value = response

        data = get_oembed_data('http://instagram.com/p/BLtI2WdAymy')

        self.assertIsNone(data)
class OpenGraphParserTestCase(ZulipTestCase):
    """Tests for OpenGraphParser."""

    def test_page_with_og(self):
        # type: () -> None
        """og: meta tags are extracted with the prefix removed."""
        html = """<html>
<head>
<meta property="og:title" content="The Rock" />
<meta property="og:type" content="video.movie" />
<meta property="og:url" content="http://www.imdb.com/title/tt0117500/" />
<meta property="og:image" content="http://ia.media-imdb.com/images/rock.jpg" />
<meta property="og:description" content="The Rock film" />
</head>
</html>"""

        extracted = OpenGraphParser(html).extract_data()

        self.assertIn('title', extracted)
        self.assertEqual(extracted['title'], 'The Rock')
        self.assertEqual(extracted.get('description'), 'The Rock film')
class GenericParserTestCase(ZulipTestCase):
    """Tests for GenericParser."""

    def test_parser(self):
        # type: () -> None
        """The title comes from <title>, the description from the <p> after <h1>."""
        html = """
<html>
<head><title>Test title</title></head>
<body>
<h1>Main header</h1>
<p>Description text</p>
</body>
</html>
"""

        extracted = GenericParser(html).extract_data()

        self.assertEqual(extracted.get('title'), 'Test title')
        self.assertEqual(extracted.get('description'), 'Description text')

View File

@ -21,6 +21,7 @@ from zerver.lib.actions import recipient_for_emails, do_update_message_flags, \
compute_mit_user_fullname, compute_irc_user_fullname, compute_jabber_user_fullname, \
create_mirror_user_if_needed, check_send_message, do_update_message, \
extract_recipients, truncate_body, render_incoming_message
from zerver.lib.queue import queue_json_publish
from zerver.lib.cache import (
generic_bulk_cached_fetch,
to_dict_cache_key_id,
@ -907,6 +908,7 @@ def update_message_backend(request, user_profile,
if subject == "":
raise JsonableError(_("Topic can't be empty"))
rendered_content = None
links_for_embed = set() # type: Set[text_type]
if content is not None:
content = content.strip()
if content == "":
@ -925,8 +927,15 @@ def update_message_backend(request, user_profile,
rendered_content = render_incoming_message(message,
content=content,
message_users=message_users)
links_for_embed |= message.links_for_preview
do_update_message(user_profile, message, subject, propagate_mode, content, rendered_content)
if links_for_embed and getattr(settings, 'INLINE_URL_EMBED_PREVIEW', None):
event_data = {
'message_id': message.id,
'message_content': message.content,
'urls': links_for_embed}
queue_json_publish('embed_links', event_data, lambda x: None)
return json_success()
@authenticated_json_post_view

View File

@ -5,7 +5,8 @@ from django.conf import settings
from django.core.handlers.wsgi import WSGIRequest
from django.core.handlers.base import BaseHandler
from zerver.models import get_user_profile_by_email, \
get_user_profile_by_id, get_prereg_user_by_email, get_client
get_user_profile_by_id, get_prereg_user_by_email, get_client, \
UserMessage, Message
from zerver.lib.context_managers import lockfile
from zerver.lib.queue import SimpleQueueClient, queue_json_publish
from zerver.lib.timestamp import timestamp_to_datetime
@ -14,7 +15,8 @@ from zerver.lib.notifications import handle_missedmessage_emails, enqueue_welcom
from zerver.lib.actions import do_send_confirmation_email, \
do_update_user_activity, do_update_user_activity_interval, do_update_user_presence, \
internal_send_message, check_send_message, extract_recipients, \
handle_push_notification
handle_push_notification, render_incoming_message, do_update_embedded_data
from zerver.lib.url_preview import preview as url_preview
from zerver.lib.digest import handle_digest_email
from zerver.lib.email_mirror import process_message as mirror_email
from zerver.decorator import JsonableError
@ -394,3 +396,27 @@ class TestWorker(QueueProcessingWorker):
logging.info("TestWorker should append this message to %s: %s" % (fn, message))
with open(fn, 'a') as f:
f.write(message + '\n')
@assign_queue('embed_links')
class FetchLinksEmbedData(QueueProcessingWorker):
    """Worker for the 'embed_links' queue: fetches link previews, then re-renders the message."""

    def consume(self, event):
        # type: (Mapping[str, Any]) -> None
        """Fetch preview data for every URL in `event`, then re-render the message.

        Reads the 'urls', 'message_id' and 'message_content' keys of `event`.
        """
        for url in event['urls']:
            url_preview.get_link_embed_data(url)

        message = Message.objects.get(id=event['message_id'])

        # If the message changed, we will run this task after updating the message
        # in zerver.views.messages.update_message_backend
        if message.content != event['message_content']:
            return
        if message.content is None:
            return

        user_messages = UserMessage.objects.filter(
            message=message.id).select_related("user_profile")
        message_users = set(um.user_profile for um in user_messages)

        # If rendering fails, the called code will raise a JsonableError.
        rendered_content = render_incoming_message(
            message,
            content=message.content,
            message_users=message_users)
        do_update_embedded_data(
            message.sender, message, message.content, rendered_content)

View File

@ -38,3 +38,6 @@ SYSTEM_ONLY_REALMS = set() # type: Set[str]
USING_PGROONGA = True
# Flush cache after migration.
POST_MIGRATION_CACHE_FLUSHING = True # type: bool
# Enable inline open graph preview in development for now
INLINE_URL_EMBED_PREVIEW = True

View File

@ -182,6 +182,7 @@ DEFAULT_SETTINGS = {'TWITTER_CONSUMER_KEY': '',
'POST_MIGRATION_CACHE_FLUSHING': False,
'ENABLE_FILE_LINKS': False,
'USE_WEBSOCKETS': True,
'INLINE_URL_EMBED_PREVIEW': False,
}
for setting_name, setting_val in six.iteritems(DEFAULT_SETTINGS):

View File

@ -106,3 +106,5 @@ REALMS_HAVE_SUBDOMAINS = bool(os.getenv('REALMS_HAVE_SUBDOMAINS', False))
# Test Custom TOS template rendering
TERMS_OF_SERVICE = 'corporate/terms.md'
INLINE_URL_EMBED_PREVIEW = False