from urllib.parse import urlsplit

from bs4.element import Tag
from typing_extensions import override

from zerver.lib.url_preview.parsers.base import BaseParser
from zerver.lib.url_preview.types import UrlEmbedData


class GenericParser(BaseParser):
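    """
    Parser that extracts a preview directly from a page's HTML using simple
    heuristics: the title comes from <title> or the first <h1>, the
    description from the meta description tag or a nearby <p>, and the
    image from the first <img> sibling of the <h1>.
    """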
    @override
    def extract_data(self) -> UrlEmbedData:
        return UrlEmbedData(
            title=self._get_title(),
            description=self._get_description(),
            image=self._get_image(),
        )

    def _get_title(self) -> str | None:
        soup = self._soup
        if soup.title and soup.title.text != "":
            return soup.title.text
        if soup.h1 and soup.h1.text != "":
            return soup.h1.text
        return None

    def _get_description(self) -> str | None:
        soup = self._soup
        meta_description = soup.find("meta", attrs={"name": "description"})
        if isinstance(meta_description, Tag) and meta_description.get("content", "") != "":
            assert isinstance(meta_description["content"], str)
            return meta_description["content"]
        first_h1 = soup.find("h1")
        if first_h1:
            first_p = first_h1.find_next("p")
            if first_p and first_p.text != "":
                return first_p.text
        first_p = soup.find("p")
        if first_p and first_p.text != "":
            return first_p.text
        return None

    def _get_image(self) -> str | None:
        """
        Find the first image after the h1 header.
        Presumably it will be the main image.
        """
        soup = self._soup
        first_h1 = soup.find("h1")
        if first_h1:
            first_image = first_h1.find_next_sibling("img", src=True)
            if isinstance(first_image, Tag) and first_image["src"] != "":
                assert isinstance(first_image["src"], str)
                try:
                    # We use urlsplit and not URLValidator because we
                    # need to support relative URLs.
                    urlsplit(first_image["src"])
                except ValueError:
                    return None
                return first_image["src"]
        return None
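# Illustrative usage sketch (not part of the parser): exercises the
# heuristics above on a small hand-written page. Because BaseParser's
# constructor is not shown in this file, the demo subclass below builds
# self._soup directly rather than guessing that signature; the HTML and
# the _DemoParser name are purely hypothetical.
if __name__ == "__main__":
    from bs4 import BeautifulSoup

    class _DemoParser(GenericParser):
        def __init__(self, html: str) -> None:
            self._soup = BeautifulSoup(html, "html.parser")

    demo_html = (
        "<html><head><title>Example page</title></head>"
        "<body><h1>Example page</h1><p>A short summary.</p>"
        '<img src="/hero.png"></body></html>'
    )
    data = _DemoParser(demo_html).extract_data()
    # Expected: title "Example page", description "A short summary.",
    # image "/hero.png" (a relative URL, which urlsplit accepts).
    print(data)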