zulip/zerver/lib/url_preview/parsers/generic.py

from __future__ import absolute_import
from typing import Any, Dict
from zerver.lib.url_preview.parsers.base import BaseParser


class GenericParser(BaseParser):
    """Fallback parser that builds preview data from ordinary HTML tags.

    All lookups go through ``self._soup``, which is not set in this class;
    presumably it is a BeautifulSoup tree prepared by ``BaseParser`` --
    confirm against the base class.
    """

    def extract_data(self):
        # type: () -> Dict
        """Return a dict with 'title', 'description' and 'image' keys.

        Each value is the extracted non-empty string, or None when the
        page offers nothing usable for that field.
        """
        return {
            'title': self._get_title(),
            'description': self._get_description(),
            'image': self._get_image()}

    def _get_title(self):
        # type: () -> Any
        """Return the <title> text, else the first <h1> text, else None."""
        soup = self._soup
        if soup.title and soup.title.text != '':
            return soup.title.text
        if soup.h1 and soup.h1.text != '':
            return soup.h1.text
        return None

    def _get_description(self):
        # type: () -> Any
        """Return a short description of the page, or None.

        Candidates, in order of preference:
        1. the content of <meta name="description">,
        2. the text of the first <p> following the first <h1>,
        3. the text of the first <p> in the document.
        Empty candidates are skipped.
        """
        soup = self._soup
        meta_description = soup.find('meta', attrs={'name': 'description'})
        if meta_description:
            # Use .get(): a description meta tag without a content
            # attribute must fall through instead of raising KeyError.
            content = meta_description.get('content')
            if content:
                return content
        first_h1 = soup.find('h1')
        if first_h1:
            first_p = first_h1.find_next('p')
            # Test .text rather than .string: .string is None for empty
            # paragraphs and for paragraphs with nested tags, so comparing
            # it against '' never filtered anything out.
            if first_p and first_p.text != '':
                return first_p.text
        first_p = soup.find('p')
        if first_p and first_p.text != '':
            # Return .text so a paragraph with nested markup yields its
            # text instead of the None that .string would give.
            return first_p.text
        return None

    def _get_image(self):
        # type: () -> Any
        """Return the src of the first <img> sibling after the first <h1>.

        Presumably that is the main image of the page. Returns None when
        there is no <h1>, no following <img> sibling, or the image has a
        missing or empty src attribute.
        """
        soup = self._soup
        first_h1 = soup.find('h1')
        if first_h1:
            first_image = first_h1.find_next_sibling('img')
            if first_image:
                # Use .get(): an <img> without a src attribute must not
                # raise KeyError.
                src = first_image.get('src')
                if src:
                    return src
        return None