2020-10-30 01:21:40 +01:00
|
|
|
from typing import Mapping, Union
|
2019-04-24 04:10:56 +02:00
|
|
|
|
2019-04-24 02:50:25 +02:00
|
|
|
from bs4 import BeautifulSoup
|
|
|
|
from django.http import HttpRequest
|
2019-04-24 03:37:34 +02:00
|
|
|
from django.utils.html import escape
|
2019-04-24 02:50:25 +02:00
|
|
|
|
|
|
|
from zerver.lib.cache import cache_with_key, open_graph_description_cache_key
|
|
|
|
|
2020-06-11 00:54:34 +02:00
|
|
|
|
2021-02-12 08:20:45 +01:00
|
|
|
def html_to_text(content: Union[str, bytes], tags: Mapping[str, str] = {"p": " | "}) -> str:
    """Extract a short, HTML-escaped plain-text summary from an HTML document.

    The document is parsed with BeautifulSoup using the lxml parser. The
    contents of ``div.admonition`` and ``div.tabbed-section`` blocks are
    emptied first so that they do not contribute to the summary.

    ``tags`` maps a tag name to the separator string inserted before that
    tag's text whenever some text has already been collected. Only elements
    whose name is a key of ``tags`` are considered; elements with empty text
    are skipped. Collection stops once more than 500 characters have been
    accumulated (the result is not truncated to 500).

    Returns the collected text with every run of whitespace collapsed to a
    single space, passed through ``escape``.
    """
    soup = BeautifulSoup(content, features="lxml")

    # Admonition (warning) blocks are usually something about users needing
    # to be an organization administrator, and tabbed-sections just contain
    # navigation instructions; neither is useful for describing the page.
    for css_class in ("admonition", "tabbed-section"):
        for block in soup.find_all("div", class_=css_class):
            block.clear()

    pieces = []
    collected_length = 0
    for node in soup.find_all(tags.keys()):
        # .text converts the element from HTML to text.
        node_text = node.text
        if not node_text:
            # Ignore empty elements.
            continue
        if collected_length:
            # Not the first fragment: prepend this tag's separator.
            separator = tags[node.name]
            pieces.append(separator)
            collected_length += len(separator)
        pieces.append(node_text)
        collected_length += len(node_text)
        if collected_length > 500:
            break

    collapsed = " ".join("".join(pieces).split())
    return escape(collapsed)
|
2019-04-24 02:50:25 +02:00
|
|
|
|
2021-02-12 08:19:30 +01:00
|
|
|
|
|
|
|
@cache_with_key(open_graph_description_cache_key, timeout=3600 * 24)
def get_content_description(content: bytes, request: HttpRequest) -> str:
    """Return the plain-text description of ``content`` from ``html_to_text``.

    ``html_to_text`` is called with its default tag mapping. The result is
    wrapped by ``cache_with_key`` using ``open_graph_description_cache_key``
    and ``timeout=3600 * 24`` (presumably seconds, i.e. one day -- confirm
    against ``cache_with_key``).

    NOTE(review): ``request`` is not read in this body; presumably it is
    consumed by the cache-key function -- verify before removing it.
    """
    description = html_to_text(content)
    return description
|