2018-05-10 19:13:36 +02:00
|
|
|
from typing import Dict, Optional
|
2016-10-27 12:06:44 +02:00
|
|
|
from zerver.lib.url_preview.parsers.base import BaseParser
|
|
|
|
|
|
|
|
|
|
|
|
class GenericParser(BaseParser):
    """Preview parser that derives its fields from generic HTML elements.

    Every lookup goes through ``self._soup``, which is not set in this
    class (presumably a parsed BeautifulSoup document provided by
    ``BaseParser`` -- confirm there).
    """

    def extract_data(self) -> Dict[str, Optional[str]]:
        """Return the ``title``, ``description`` and ``image`` preview fields.

        Each value is ``None`` when the corresponding lookup found nothing
        usable.
        """
        return {
            'title': self._get_title(),
            'description': self._get_description(),
            'image': self._get_image(),
        }

    def _get_title(self) -> Optional[str]:
        """Return the ``<title>`` text, else the first ``<h1>`` text, else None."""
        soup = self._soup
        if soup.title and soup.title.text != '':
            return soup.title.text
        if soup.h1 and soup.h1.text != '':
            return soup.h1.text
        return None

    def _get_description(self) -> Optional[str]:
        """Return a description for the page, or None if none was found.

        Candidates are tried in order:

        1. the ``content`` of ``<meta name="description">``;
        2. the text of the first ``<p>`` following the first ``<h1>``;
        3. the text of the first ``<p>`` in the document.
        """
        soup = self._soup
        meta_description = soup.find('meta', attrs={'name': 'description'})
        if meta_description:
            content = meta_description.get('content', '')
            if content != '':
                return content

        # ``.text`` is used rather than ``.string``: ``.string`` is None for
        # a paragraph with nested markup (e.g. ``<p>Hi <b>there</b></p>``),
        # which would defeat the emptiness check and lose the paragraph text.
        first_h1 = soup.find('h1')
        if first_h1:
            paragraph_after_h1 = first_h1.find_next('p')
            if paragraph_after_h1 and paragraph_after_h1.text != '':
                return paragraph_after_h1.text

        first_p = soup.find('p')
        if first_p and first_p.text != '':
            return first_p.text
        return None

    def _get_image(self) -> Optional[str]:
        """Return the ``src`` of the first image after the h1 header.

        Presumably it will be the main image.  Only ``<img>`` elements that
        are siblings of the first ``<h1>`` are considered.  Returns None when
        there is no ``<h1>``, no such image, or the image has a missing or
        empty ``src``.
        """
        soup = self._soup
        first_h1 = soup.find('h1')
        if first_h1:
            first_image = first_h1.find_next_sibling('img')
            if first_image:
                # ``.get`` avoids a KeyError on an <img> without ``src``
                # (e.g. lazy-loaded images that only carry ``data-src``).
                src = first_image.get('src')
                if src:
                    return src
        return None
|