zulip/zerver/lib/html_to_text.py

from bs4 import BeautifulSoup
from django.http import HttpRequest
from django.utils.html import escape

from zerver.lib.cache import cache_with_key, open_graph_description_cache_key

def html_to_text(content: str) -> str:
    """Build a short, HTML-escaped plain-text summary from an HTML document.

    The markup is parsed with BeautifulSoup (lxml parser), the contents of
    ``div.admonition`` and ``div.code-section`` blocks are discarded, and
    the text of the remaining ``<p>`` elements is gathered until more than
    500 characters have been collected.  The paragraph that crosses that
    threshold is kept whole, so the result may be longer than 500
    characters.  Runs of whitespace are collapsed to single spaces and the
    final string is passed through Django's ``escape``.
    """
    soup = BeautifulSoup(content, features='lxml')

    # Empty out blocks that don't help describe the page:
    #   * admonition (warning) blocks are usually something about users
    #     needing to be an organization administrator;
    #   * code-sections just contain navigation instructions.
    for skipped_class in ("admonition", "code-section"):
        for div in soup.find_all('div', class_=skipped_class):
            div.clear()

    pieces = []
    collected = 0
    for para in soup.find_all('p'):
        # .text converts the paragraph from HTML to text
        chunk = para.text
        pieces.append(chunk)
        # Count one separator per paragraph, so the cutoff is measured on
        # the space-joined text including a trailing space.
        collected += len(chunk) + 1
        if collected > 500:
            break

    words = ' '.join(pieces).split()
    return escape(' '.join(words))

@cache_with_key(open_graph_description_cache_key, timeout=3600*24)
def get_content_description(content: bytes, request: HttpRequest) -> str:
    """Return the plain-text description for a UTF-8 encoded HTML payload.

    ``content`` is decoded as UTF-8 and handed to ``html_to_text``.  The
    result is cached through ``cache_with_key`` with a timeout of
    ``3600*24`` (presumably seconds, i.e. one day -- confirm against
    ``cache_with_key``).

    ``request`` is not used in the body; presumably it is consumed by
    ``open_graph_description_cache_key`` to build the cache key -- verify
    against that helper.
    """
    return html_to_text(content.decode("utf-8"))
html_to_text: Extract code for html to plain text conversion. 2019-04-24 02:50:25 +02:00			`from bs4 import BeautifulSoup`
			`from django.http import HttpRequest`
html_to_text: Escape text when using as description. 2019-04-24 03:37:34 +02:00			`from django.utils.html import escape`
html_to_text: Extract code for html to plain text conversion. 2019-04-24 02:50:25 +02:00
			`from zerver.lib.cache import cache_with_key, open_graph_description_cache_key`

			`def html_to_text(content: str) -> str:`
			`bs = BeautifulSoup(content, features='lxml')`
			`# Skip any admonition (warning) blocks, since they're`
			`# usually something about users needing to be an`
			`# organization administrator, and not useful for`
			`# describing the page.`
			`for tag in bs.find_all('div', class_="admonition"):`
			`tag.clear()`

			`# Skip code-sections, which just contains navigation instructions.`
			`for tag in bs.find_all('div', class_="code-section"):`
			`tag.clear()`

			`text = ''`
			`for paragraph in bs.find_all('p'):`
			`# .text converts it from HTML to text`
			`text = text + paragraph.text + ' '`
			`if len(text) > 500:`
html_to_text: Escape text when using as description. 2019-04-24 03:37:34 +02:00			`break`
			`return escape(' '.join(text.split()))`
html_to_text: Extract code for html to plain text conversion. 2019-04-24 02:50:25 +02:00
			`@cache_with_key(open_graph_description_cache_key, timeout=3600*24)`
			`def get_content_description(content: bytes, request: HttpRequest) -> str:`
			`str_content = content.decode("utf-8")`
			`return html_to_text(str_content)`