From 14db6e8c147e94fa0316b3c97ad13dbb327ef802 Mon Sep 17 00:00:00 2001
From: Anders Kaseorg <anders@zulip.com>
Date: Tue, 22 Oct 2024 21:21:51 -0700
Subject: [PATCH] capitalization: Avoid bs4.MarkupResemblesLocatorWarning.

Signed-off-by: Anders Kaseorg <anders@zulip.com>
---
 tools/lib/capitalization.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/lib/capitalization.py b/tools/lib/capitalization.py
index 5d4394a65d..17afcd37f8 100644
--- a/tools/lib/capitalization.py
+++ b/tools/lib/capitalization.py
@@ -1,4 +1,5 @@
 import re
+from io import StringIO
 from re import Match
 
 from bs4 import BeautifulSoup
@@ -182,7 +183,8 @@ IGNORED_PHRASES.sort(key=len, reverse=True)
 # text using BeautifulSoup and then removes extra whitespaces from
 # it. This step enables us to add HTML in our regexes directly.
 COMPILED_IGNORED_PHRASES = [
-    re.compile(r" ".join(BeautifulSoup(regex, "lxml").text.split())) for regex in IGNORED_PHRASES
+    re.compile(r" ".join(BeautifulSoup(StringIO(regex), "lxml").text.split()))
+    for regex in IGNORED_PHRASES
 ]
 
 SPLIT_BOUNDARY = r"?.!"  # Used to split string into sentences.
@@ -241,7 +243,7 @@ def get_safe_text(text: str) -> str:
     This returns text which is rendered by BeautifulSoup and is in the
     form that can be split easily and has all IGNORED_PHRASES processed.
     """
-    soup = BeautifulSoup(text, "lxml")
+    soup = BeautifulSoup(StringIO(text), "lxml")
     text = " ".join(soup.text.split())  # Remove extra whitespaces.
     for phrase_regex in COMPILED_IGNORED_PHRASES:
         text = phrase_regex.sub(replace_with_safe_phrase, text)