diff --git a/tools/lib/capitalization.py b/tools/lib/capitalization.py
index 5d4394a65d..17afcd37f8 100644
--- a/tools/lib/capitalization.py
+++ b/tools/lib/capitalization.py
@@ -1,4 +1,5 @@
 import re
+from io import StringIO
 from re import Match
 
 from bs4 import BeautifulSoup
@@ -182,7 +183,8 @@ IGNORED_PHRASES.sort(key=len, reverse=True)
 # text using BeautifulSoup and then removes extra whitespaces from
 # it. This step enables us to add HTML in our regexes directly.
 COMPILED_IGNORED_PHRASES = [
-    re.compile(r" ".join(BeautifulSoup(regex, "lxml").text.split())) for regex in IGNORED_PHRASES
+    re.compile(r" ".join(BeautifulSoup(StringIO(regex), "lxml").text.split()))
+    for regex in IGNORED_PHRASES
 ]
 
 SPLIT_BOUNDARY = r"?.!"  # Used to split string into sentences.
@@ -241,7 +243,7 @@ def get_safe_text(text: str) -> str:
     This returns text which is rendered by BeautifulSoup and is in the form
     that can be split easily and has all IGNORED_PHRASES processed.
     """
-    soup = BeautifulSoup(text, "lxml")
+    soup = BeautifulSoup(StringIO(text), "lxml")
     text = " ".join(soup.text.split())  # Remove extra whitespaces.
    for phrase_regex in COMPILED_IGNORED_PHRASES:
         text = phrase_regex.sub(replace_with_safe_phrase, text)
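
Not part of the diff itself: a minimal sketch of what the StringIO wrapping buys, assuming the intent is to avoid BeautifulSoup's MarkupResemblesLocatorWarning. That heuristic only runs on plain-string markup that looks like a filename or URL; a file-like object is always read and parsed as markup. The phrase below is a hypothetical stand-in, not taken from IGNORED_PHRASES.

# Sketch only; the phrase is illustrative, not from the actual list.
from io import StringIO
from bs4 import BeautifulSoup

markup = "Organization administrator"
# Wrapping the string in StringIO makes BeautifulSoup treat it as a file-like
# source of markup, so the "does this resemble a filename/URL?" check is skipped.
text = BeautifulSoup(StringIO(markup), "lxml").text
print(" ".join(text.split()))  # normalized whitespace, same as the diff does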