2019-05-02 18:58:39 +02:00
|
|
|
from bs4 import BeautifulSoup, SoupStrainer
|
2018-05-10 19:13:36 +02:00
|
|
|
from typing import Optional, Dict, Any
|
2016-10-27 12:06:44 +02:00
|
|
|
from pyoembed import oEmbed, PyOembedException
|
|
|
|
|
2018-05-10 19:13:36 +02:00
|
|
|
def get_oembed_data(url: str,
                    maxwidth: Optional[int]=640,
                    maxheight: Optional[int]=480) -> Optional[Dict[str, Any]]:
    """Fetch oEmbed metadata for ``url`` and normalise it for previews.

    Returns ``None`` when the oEmbed lookup raises ``PyOembedException``.
    Otherwise returns the oEmbed response with these adjustments:

    * the raw ``html`` entry is always removed from the response;
    * for a ``photo`` resource that has an image URL, ``image`` is set and
      ``oembed`` is set to ``True``;
    * for a ``video`` resource that has both html and a thumbnail, ``html``
      is replaced by its sanitised form, ``image`` is set to the thumbnail
      and ``oembed`` is set to ``True``.

    Any other response is returned without the ``oembed`` marker.
    """
    try:
        data = oEmbed(url, maxwidth=maxwidth, maxheight=maxheight)
    except PyOembedException:
        return None

    kind = data.get('type', '')
    photo_url = data.get('url', data.get('image'))
    preview_url = data.get('thumbnail_url')
    # Drop the provider's html unconditionally; only the video branch below
    # puts a sanitised copy back.
    embed_html = data.pop('html', '')

    if kind == 'photo':
        if photo_url:
            data['image'] = photo_url
            # Marks this metadata as coming from oEmbed rather than another source.
            data['oembed'] = True
    elif kind == 'video' and embed_html and preview_url:
        data['html'] = get_safe_html(embed_html)
        data['image'] = preview_url
        # Marks this metadata as coming from oEmbed rather than another source.
        data['oembed'] = True

    return data
|
2019-05-02 18:58:39 +02:00
|
|
|
|
|
|
|
def get_safe_html(html: str) -> str:
    """Return a safe version of the oEmbed html.

    Verify that the HTML:

    1. has a single iframe
    2. the src uses a schema relative URL or explicitly specifies http(s)

    A surrounding ``<![CDATA[ ... ]]>`` wrapper is stripped before parsing.
    Only ``iframe`` elements are kept by the parser; everything else in the
    input is discarded.

    Returns the serialised iframe when both checks pass, otherwise an empty
    string (including when the iframe has no ``src`` attribute).
    """
    if html.startswith('<![CDATA[') and html.endswith(']]>'):
        # 9 == len('<![CDATA['), 3 == len(']]>')
        html = html[9:-3]
    soup = BeautifulSoup(html, 'lxml', parse_only=SoupStrainer('iframe'))

    # str(soup) serialises every iframe that was parsed, so validating only
    # the first one would let additional, unchecked iframes through.
    iframes = soup.find_all('iframe')
    if len(iframes) != 1:
        return ''

    # Tag.get() returns None for a missing attribute; treat that as unsafe
    # instead of raising AttributeError on .startswith().
    src = iframes[0].get('src')
    if src is None or not src.startswith(('http://', 'https://', '//')):
        return ''
    return str(soup)
|