narrow: Fix topic highlighting issue with apostrophes in search results.

This commit addresses the issue where the topic highlighting
in search results was offset by one character when an
apostrophe was present. The problem stemmed from a disparity
in HTML escaping: the function `func.escape_html`, which is used
to obtain `topic_matches`, escapes apostrophes (') differently
from the function `django.utils.html.escape`.

func.escape_html | django.utils.html.escape
-----------------+--------------------------
      &#39;      |          &#x27;

To fix this, the SQL query is changed to return the HTML-escaped
topic name generated by the function `func.escape_html`.

Fixes: #25633.
This commit is contained in:
Akshat 2023-05-29 23:33:35 +05:30 committed by Tim Abbott
parent 446aea41b3
commit baede93f69
3 changed files with 76 additions and 12 deletions

View File

@ -974,7 +974,27 @@ def add_narrow_conditions(
if search_operands:
is_search = True
query = query.add_columns(topic_column_sa(), column("rendered_content", Text))
query = query.add_columns(
# This topic escaping logic ensures consistent escaping of topic names throughout
# the system, ensuring accuracy in string highlighting and avoiding any discrepancies.
#
# When a topic name is fetched from the database, it goes through this logic.
# The `func.escape_html()` function is used to escape the topic name, ensuring that
# special characters are properly escaped. This helps to avoid the need to apply other
# escaping logic to the topic name for string highlighting purposes. As a result, the
# highlighted string will accurately match the actual topic name displayed in the UI.
# This approach prevents any inconsistencies or offsets that could occur if different
# escaping functions were used.
#
# It's important to note that the `process_fts_updates` script, responsible for
# updating the relevant columns in the database, also utilizes the same escaping
# logic. This alignment ensures that the escaped topic names stored in the database
# and the topic names used during string highlighting are in sync. Therefore, there
# is no need for any special handling in `process_fts_updates` to align with this
# escaping logic.
func.escape_html(topic_column_sa(), type_=Text).label("escaped_topic_name"),
column("rendered_content", Text),
)
search_term = dict(
operator="search",
operand=" ".join(search_operands),

View File

@ -2536,6 +2536,7 @@ class GetOldMessagesTest(ZulipTestCase):
("日本", "今朝はごはんを食べました。"),
("日本", "昨日、日本 のお菓子を送りました。"),
("english", "I want to go to 日本!"),
("McDonald'sBurger", "McDonald'sBurger"),
]
next_message_id = self.get_last_message().id + 1
@ -2663,6 +2664,28 @@ class GetOldMessagesTest(ZulipTestCase):
'<p>こんに <span class="highlight">ちは</span> 。 <span class="highlight">今日は</span> いい 天気ですね。</p>',
)
# Search operands with HTML special characters
special_search_narrow = [
dict(operator="search", operand="sBurger"),
]
special_search_result = self.get_and_check_messages(
dict(
narrow=orjson.dumps(special_search_narrow).decode(),
anchor=next_message_id,
num_after=10,
num_before=0,
)
)
self.assert_length(special_search_result["messages"], 1)
self.assertEqual(
special_search_result["messages"][0][MATCH_TOPIC],
'McDonald&#39;<span class="highlight">sBurger</span>',
)
self.assertEqual(
special_search_result["messages"][0]["match_content"],
'<p>McDonald\'<span class="highlight">sBurger</span></p>',
)
@override_settings(USING_PGROONGA=False)
def test_get_visible_messages_with_search(self) -> None:
self.login("hamlet")
@ -2741,6 +2764,7 @@ class GetOldMessagesTest(ZulipTestCase):
("english", "https://domain.com/path/to.something-I,want/"),
("english", "foo.cht"),
("bread & butter", "chalk & cheese"),
("McDonald'sBurger", "McDonald'sBurger"),
]
for topic, content in messages_to_search:
@ -2947,6 +2971,27 @@ class GetOldMessagesTest(ZulipTestCase):
'<p>chalk <span class="highlight">&amp;</span> cheese</p>',
)
special_search_narrow = [
dict(operator="search", operand="sBurger"),
]
special_search_result = self.get_and_check_messages(
dict(
narrow=orjson.dumps(special_search_narrow).decode(),
anchor=next_message_id,
num_after=10,
num_before=0,
)
)
self.assert_length(special_search_result["messages"], 1)
self.assertEqual(
special_search_result["messages"][0][MATCH_TOPIC],
'McDonald&#39;<span class="highlight">sBurger</span>',
)
self.assertEqual(
special_search_result["messages"][0]["match_content"],
'<p>McDonald\'<span class="highlight">sBurger</span></p>',
)
def test_messages_in_narrow_for_non_search(self) -> None:
user = self.example_user("cordelia")
self.login_user(user)
@ -4072,8 +4117,8 @@ recipient_id = %(recipient_id_3)s AND upper(subject) = upper(%(param_2)s))\
query_ids = self.get_query_ids()
sql_template = """\
SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \n\
FROM (SELECT message_id, flags, subject, rendered_content, array((SELECT ARRAY[sum(length(anon_3) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_3, '</ts-match>') - 1] AS anon_2 \n\
SELECT anon_1.message_id, anon_1.flags, anon_1.escaped_topic_name, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \n\
FROM (SELECT message_id, flags, escape_html(subject) AS escaped_topic_name, rendered_content, array((SELECT ARRAY[sum(length(anon_3) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_3, '</ts-match>') - 1] AS anon_2 \n\
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', 'jumping'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_3\n\
LIMIT ALL OFFSET 1)) AS content_matches, array((SELECT ARRAY[sum(length(anon_5) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_5, '</ts-match>') - 1] AS anon_4 \n\
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', 'jumping'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_5\n\
@ -4088,8 +4133,8 @@ WHERE user_profile_id = {hamlet_id} AND (search_tsvector @@ plainto_tsquery('zul
)
sql_template = """\
SELECT anon_1.message_id, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \n\
FROM (SELECT id AS message_id, subject, rendered_content, array((SELECT ARRAY[sum(length(anon_3) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_3, '</ts-match>') - 1] AS anon_2 \n\
SELECT anon_1.message_id, anon_1.escaped_topic_name, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \n\
FROM (SELECT id AS message_id, escape_html(subject) AS escaped_topic_name, rendered_content, array((SELECT ARRAY[sum(length(anon_3) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_3, '</ts-match>') - 1] AS anon_2 \n\
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', 'jumping'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_3\n\
LIMIT ALL OFFSET 1)) AS content_matches, array((SELECT ARRAY[sum(length(anon_5) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_5, '</ts-match>') - 1] AS anon_4 \n\
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', 'jumping'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_5\n\
@ -4110,8 +4155,8 @@ WHERE recipient_id = {scotland_recipient} AND (search_tsvector @@ plainto_tsquer
)
sql_template = """\
SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \n\
FROM (SELECT message_id, flags, subject, rendered_content, array((SELECT ARRAY[sum(length(anon_3) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_3, '</ts-match>') - 1] AS anon_2 \n\
SELECT anon_1.message_id, anon_1.flags, anon_1.escaped_topic_name, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \n\
FROM (SELECT message_id, flags, escape_html(subject) AS escaped_topic_name, rendered_content, array((SELECT ARRAY[sum(length(anon_3) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_3, '</ts-match>') - 1] AS anon_2 \n\
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', '"jumping" quickly'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_3\n\
LIMIT ALL OFFSET 1)) AS content_matches, array((SELECT ARRAY[sum(length(anon_5) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_5, '</ts-match>') - 1] AS anon_4 \n\
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', '"jumping" quickly'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_5\n\

View File

@ -2,7 +2,6 @@ from typing import Dict, Iterable, List, Optional, Tuple, Union
from django.contrib.auth.models import AnonymousUser
from django.http import HttpRequest, HttpResponse
from django.utils.html import escape as escape_html
from django.utils.translation import gettext as _
from sqlalchemy.sql import and_, column, join, literal, literal_column, select, table
from sqlalchemy.types import Integer, Text
@ -68,13 +67,13 @@ def highlight_string(text: str, locs: Iterable[Tuple[int, int]]) -> str:
def get_search_fields(
rendered_content: str,
topic_name: str,
escaped_topic_name: str,
content_matches: Iterable[Tuple[int, int]],
topic_matches: Iterable[Tuple[int, int]],
) -> Dict[str, str]:
return {
"match_content": highlight_string(rendered_content, content_matches),
MATCH_TOPIC: highlight_string(escape_html(topic_name), topic_matches),
MATCH_TOPIC: highlight_string(escaped_topic_name, topic_matches),
}
@ -207,9 +206,9 @@ def get_messages_backend(
if is_search:
for row in rows:
message_id = row[0]
(topic_name, rendered_content, content_matches, topic_matches) = row[-4:]
(escaped_topic_name, rendered_content, content_matches, topic_matches) = row[-4:]
search_fields[message_id] = get_search_fields(
rendered_content, topic_name, content_matches, topic_matches
rendered_content, escaped_topic_name, content_matches, topic_matches
)
message_list = messages_for_ids(