capitalization: Avoid bs4.MarkupResemblesLocatorWarning.

Signed-off-by: Anders Kaseorg <anders@zulip.com>
This commit is contained in:
Anders Kaseorg 2024-10-22 21:21:51 -07:00 committed by Tim Abbott
parent ec437fb770
commit 14db6e8c14
1 changed file with 4 additions and 2 deletions

View File

@@ -1,4 +1,5 @@
import re import re
from io import StringIO
from re import Match from re import Match
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
@@ -182,7 +183,8 @@ IGNORED_PHRASES.sort(key=len, reverse=True)
# text using BeautifulSoup and then removes extra whitespaces from # text using BeautifulSoup and then removes extra whitespaces from
# it. This step enables us to add HTML in our regexes directly. # it. This step enables us to add HTML in our regexes directly.
COMPILED_IGNORED_PHRASES = [ COMPILED_IGNORED_PHRASES = [
re.compile(r" ".join(BeautifulSoup(regex, "lxml").text.split())) for regex in IGNORED_PHRASES re.compile(r" ".join(BeautifulSoup(StringIO(regex), "lxml").text.split()))
for regex in IGNORED_PHRASES
] ]
SPLIT_BOUNDARY = r"?.!" # Used to split string into sentences. SPLIT_BOUNDARY = r"?.!" # Used to split string into sentences.
@@ -241,7 +243,7 @@ def get_safe_text(text: str) -> str:
This returns text which is rendered by BeautifulSoup and is in the This returns text which is rendered by BeautifulSoup and is in the
form that can be split easily and has all IGNORED_PHRASES processed. form that can be split easily and has all IGNORED_PHRASES processed.
""" """
soup = BeautifulSoup(text, "lxml") soup = BeautifulSoup(StringIO(text), "lxml")
text = " ".join(soup.text.split()) # Remove extra whitespaces. text = " ".join(soup.text.split()) # Remove extra whitespaces.
for phrase_regex in COMPILED_IGNORED_PHRASES: for phrase_regex in COMPILED_IGNORED_PHRASES:
text = phrase_regex.sub(replace_with_safe_phrase, text) text = phrase_regex.sub(replace_with_safe_phrase, text)