diff --git a/zerver/tests/test_narrow.py b/zerver/tests/test_narrow.py index 0a3380b818..5a9ab670f0 100644 --- a/zerver/tests/test_narrow.py +++ b/zerver/tests/test_narrow.py @@ -340,13 +340,13 @@ class NarrowBuilderTest(ZulipTestCase): @override_settings(USING_PGROONGA=False) def test_add_term_using_search_operator(self) -> None: term = dict(operator='search', operand='"french fries"') - self._do_add_term_test(term, 'WHERE (content ILIKE %(content_1)s OR subject ILIKE %(subject_1)s) AND (search_tsvector @@ plainto_tsquery(%(param_2)s, %(param_3)s))') + self._do_add_term_test(term, 'WHERE (content ILIKE %(content_1)s OR subject ILIKE %(subject_1)s) AND (search_tsvector @@ plainto_tsquery(%(param_4)s, %(param_5)s))') @override_settings(USING_PGROONGA=False) def test_add_term_using_search_operator_and_negated( self) -> None: # NEGATED term = dict(operator='search', operand='"french fries"', negated=True) - self._do_add_term_test(term, 'WHERE NOT (content ILIKE %(content_1)s OR subject ILIKE %(subject_1)s) AND NOT (search_tsvector @@ plainto_tsquery(%(param_2)s, %(param_3)s))') + self._do_add_term_test(term, 'WHERE NOT (content ILIKE %(content_1)s OR subject ILIKE %(subject_1)s) AND NOT (search_tsvector @@ plainto_tsquery(%(param_4)s, %(param_5)s))') @override_settings(USING_PGROONGA=True) def test_add_term_using_search_operator_pgroonga(self) -> None: @@ -2749,19 +2749,49 @@ recipient_id = %(recipient_id_3)s AND upper(subject) = upper(%(param_2)s))\ def test_get_messages_with_search_queries(self) -> None: query_ids = self.get_query_ids() - sql_template = "SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \nFROM (SELECT message_id, flags, subject, rendered_content, ts_match_locs_array('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', 'jumping')) AS content_matches, ts_match_locs_array('zulip.english_us_search', escape_html(subject), 
plainto_tsquery('zulip.english_us_search', 'jumping')) AS topic_matches \nFROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \nWHERE user_profile_id = {hamlet_id} AND (search_tsvector @@ plainto_tsquery('zulip.english_us_search', 'jumping')) ORDER BY message_id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC" # type: str + sql_template = """\ +SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \n\ +FROM (SELECT message_id, flags, subject, rendered_content, array((SELECT ARRAY[sum(length(anon_3) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_3, '</ts-match>') - 1] AS anon_2 \n\ +FROM unnest(string_to_array(ts_headline('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', 'jumping'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_3 \n\ + LIMIT ALL OFFSET 1)) AS content_matches, array((SELECT ARRAY[sum(length(anon_5) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_5, '</ts-match>') - 1] AS anon_4 \n\ +FROM unnest(string_to_array(ts_headline('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', 'jumping'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_5 \n\ + LIMIT ALL OFFSET 1)) AS topic_matches \n\ +FROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \n\ +WHERE user_profile_id = {hamlet_id} AND (search_tsvector @@ plainto_tsquery('zulip.english_us_search', 'jumping')) ORDER BY message_id ASC \n\ + LIMIT 10) AS anon_1 ORDER BY message_id ASC\ +""" sql = sql_template.format(**query_ids) self.common_check_get_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 9, 'narrow': '[["search", "jumping"]]'}, sql) - sql_template = "SELECT anon_1.message_id, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \nFROM (SELECT id AS 
message_id, subject, rendered_content, ts_match_locs_array('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', 'jumping')) AS content_matches, ts_match_locs_array('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', 'jumping')) AS topic_matches \nFROM zerver_message \nWHERE recipient_id = {scotland_recipient} AND (search_tsvector @@ plainto_tsquery('zulip.english_us_search', 'jumping')) ORDER BY zerver_message.id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC" + sql_template = """\ +SELECT anon_1.message_id, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \n\ +FROM (SELECT id AS message_id, subject, rendered_content, array((SELECT ARRAY[sum(length(anon_3) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_3, '</ts-match>') - 1] AS anon_2 \n\ +FROM unnest(string_to_array(ts_headline('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', 'jumping'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_3 \n\ + LIMIT ALL OFFSET 1)) AS content_matches, array((SELECT ARRAY[sum(length(anon_5) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_5, '</ts-match>') - 1] AS anon_4 \n\ +FROM unnest(string_to_array(ts_headline('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', 'jumping'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_5 \n\ + LIMIT ALL OFFSET 1)) AS topic_matches \n\ +FROM zerver_message \n\ +WHERE recipient_id = {scotland_recipient} AND (search_tsvector @@ plainto_tsquery('zulip.english_us_search', 'jumping')) ORDER BY zerver_message.id ASC \n\ + LIMIT 10) AS anon_1 ORDER BY message_id ASC\ +""" sql = sql_template.format(**query_ids) self.common_check_get_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 9, 'narrow': '[["stream", "Scotland"], ["search", "jumping"]]'}, sql) - sql_template = 'SELECT 
anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \nFROM (SELECT message_id, flags, subject, rendered_content, ts_match_locs_array(\'zulip.english_us_search\', rendered_content, plainto_tsquery(\'zulip.english_us_search\', \'"jumping" quickly\')) AS content_matches, ts_match_locs_array(\'zulip.english_us_search\', escape_html(subject), plainto_tsquery(\'zulip.english_us_search\', \'"jumping" quickly\')) AS topic_matches \nFROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \nWHERE user_profile_id = {hamlet_id} AND (content ILIKE \'%jumping%\' OR subject ILIKE \'%jumping%\') AND (search_tsvector @@ plainto_tsquery(\'zulip.english_us_search\', \'"jumping" quickly\')) ORDER BY message_id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC' + sql_template = """\ +SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \n\ +FROM (SELECT message_id, flags, subject, rendered_content, array((SELECT ARRAY[sum(length(anon_3) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_3, '</ts-match>') - 1] AS anon_2 \n\ +FROM unnest(string_to_array(ts_headline('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', '"jumping" quickly'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_3 \n\ + LIMIT ALL OFFSET 1)) AS content_matches, array((SELECT ARRAY[sum(length(anon_5) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_5, '</ts-match>') - 1] AS anon_4 \n\ +FROM unnest(string_to_array(ts_headline('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', '"jumping" quickly'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_5 \n\ + LIMIT ALL OFFSET 1)) AS topic_matches \n\ +FROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \n\ 
+WHERE user_profile_id = {hamlet_id} AND (content ILIKE '%jumping%' OR subject ILIKE '%jumping%') AND (search_tsvector @@ plainto_tsquery('zulip.english_us_search', '"jumping" quickly')) ORDER BY message_id ASC \n\ + LIMIT 10) AS anon_1 ORDER BY message_id ASC\ +""" sql = sql_template.format(**query_ids) self.common_check_get_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 9, 'narrow': '[["search", "\\"jumping\\" quickly"]]'}, diff --git a/zerver/views/messages.py b/zerver/views/messages.py index b81508c5a2..12f6eab149 100644 --- a/zerver/views/messages.py +++ b/zerver/views/messages.py @@ -56,6 +56,7 @@ from zerver.models import Message, UserProfile, Stream, Subscription, Client,\ get_user_by_id_in_realm_including_cross_realm, get_stream_recipient from sqlalchemy import func +from sqlalchemy.dialects import postgresql from sqlalchemy.sql import select, join, column, literal_column, literal, and_, \ or_, not_, union_all, alias, Selectable, ColumnElement, table @@ -86,6 +87,27 @@ ConditionTransform = Any OptionalNarrowListT = Optional[List[Dict[str, Any]]] +# These delimiters will not appear in rendered messages or HTML-escaped topics. 
+TS_START = "<ts-match>" +TS_STOP = "</ts-match>" + +def ts_locs_array( + config: ColumnElement, text: ColumnElement, tsquery: ColumnElement +) -> ColumnElement: + options = "HighlightAll = TRUE, StartSel = %s, StopSel = %s" % (TS_START, TS_STOP) + delimited = func.ts_headline(config, text, tsquery, options) + parts = func.unnest(func.string_to_array(delimited, TS_START)).alias() + part = column(parts.name) + part_len = func.length(part) - len(TS_STOP) + match_pos = func.sum(part_len).over(rows=(None, -1)) + len(TS_STOP) + match_len = func.strpos(part, TS_STOP) - 1 + return func.array( + select([postgresql.array([match_pos, match_len])]) + .select_from(parts) + .offset(1) + .as_scalar() + ) + # When you add a new operator to this, also update zerver/lib/narrow.py class NarrowBuilder: ''' @@ -430,7 +452,6 @@ class NarrowBuilder: def _by_search_tsearch(self, query: Query, operand: str, maybe_negate: ConditionTransform) -> Query: tsquery = func.plainto_tsquery(literal("zulip.english_us_search"), literal(operand)) - ts_locs_array = func.ts_match_locs_array query = query.column(ts_locs_array(literal("zulip.english_us_search"), column("rendered_content"), tsquery).label("content_matches")) @@ -454,9 +475,6 @@ class NarrowBuilder: cond = column("search_tsvector").op("@@")(tsquery) return query.where(maybe_negate(cond)) -# The offsets we get from PGroonga are counted in characters -# whereas the offsets from tsearch_extras are in bytes, so we -# have to account for both cases in the logic below. def highlight_string(text: str, locs: Iterable[Tuple[int, int]]) -> str: highlight_start = '<span class="highlight">' highlight_stop = '</span>' @@ -464,24 +482,16 @@ def highlight_string(text: str, locs: Iterable[Tuple[int, int]]) -> str: result = '' in_tag = False - text_utf8 = text.encode('utf8') - for loc in locs: (offset, length) = loc - # These indexes are in byte space for tsearch, - # and they are in string space for pgroonga. 
prefix_start = pos prefix_end = offset match_start = offset match_end = offset + length - if settings.USING_PGROONGA: - prefix = text[prefix_start:prefix_end] - match = text[match_start:match_end] - else: - prefix = text_utf8[prefix_start:prefix_end].decode() - match = text_utf8[match_start:match_end].decode() + prefix = text[prefix_start:prefix_end] + match = text[match_start:match_end] for character in (prefix + match): if character == '<': @@ -498,12 +508,7 @@ def highlight_string(text: str, locs: Iterable[Tuple[int, int]]) -> str: result += highlight_stop pos = match_end - if settings.USING_PGROONGA: - final_frag = text[pos:] - else: - final_frag = text_utf8[pos:].decode() - - result += final_frag + result += text[pos:] return result def get_search_fields(rendered_content: str, topic_name: str, content_matches: Iterable[Tuple[int, int]],