2024-10-22 01:40:24 +02:00
|
|
|
from email.message import EmailMessage
|
2022-04-14 21:52:41 +02:00
|
|
|
|
|
|
|
from zerver.lib.url_preview.types import UrlEmbedData
|
2016-10-27 12:06:44 +02:00
|
|
|
|
2020-06-11 00:54:34 +02:00
|
|
|
|
2017-11-05 11:37:41 +01:00
|
|
|
class BaseParser:
    """Base class that parses an HTML document into a BeautifulSoup tree.

    The parsed tree is stored on ``self._soup``; ``extract_data`` is left
    unimplemented here.
    """

    def __init__(self, html_source: bytes, content_type: str | None) -> None:
        """Parse ``html_source`` and keep the resulting tree in ``self._soup``.

        Args:
            html_source: Raw bytes of the HTML document.
            content_type: Value of the Content-Type header for the
                document, or None.  Its ``charset`` parameter (when
                present) is passed to BeautifulSoup as the encoding.
        """
        # BeautifulSoup is deliberately imported inside the constructor:
        # most production processes never use it, and bs4 is large enough
        # that a module-level import would add 10s of milliseconds to
        # manage.py startup.
        from bs4 import BeautifulSoup

        # Let the standard library's header machinery pull the charset
        # parameter out of the Content-Type value instead of parsing the
        # string by hand.
        header_holder = EmailMessage()
        header_holder["Content-Type"] = content_type

        self._soup = BeautifulSoup(
            html_source,
            "lxml",
            from_encoding=header_holder.get_content_charset(),
        )

    def extract_data(self) -> UrlEmbedData:
        """Return the embed data extracted from the parsed document.

        Raises:
            NotImplementedError: Always; this base class provides no
                implementation.
        """
        raise NotImplementedError
|