mirror of https://github.com/zulip/zulip.git
search: Reimplement ts_locs_array in pure PostgreSQL.
Signed-off-by: Anders Kaseorg <anders@zulipchat.com>
This commit is contained in:
parent
d2d4800b1c
commit
c3e395b7d8
|
@ -340,13 +340,13 @@ class NarrowBuilderTest(ZulipTestCase):
|
|||
@override_settings(USING_PGROONGA=False)
|
||||
def test_add_term_using_search_operator(self) -> None:
|
||||
term = dict(operator='search', operand='"french fries"')
|
||||
self._do_add_term_test(term, 'WHERE (content ILIKE %(content_1)s OR subject ILIKE %(subject_1)s) AND (search_tsvector @@ plainto_tsquery(%(param_2)s, %(param_3)s))')
|
||||
self._do_add_term_test(term, 'WHERE (content ILIKE %(content_1)s OR subject ILIKE %(subject_1)s) AND (search_tsvector @@ plainto_tsquery(%(param_4)s, %(param_5)s))')
|
||||
|
||||
@override_settings(USING_PGROONGA=False)
|
||||
def test_add_term_using_search_operator_and_negated(
|
||||
self) -> None: # NEGATED
|
||||
term = dict(operator='search', operand='"french fries"', negated=True)
|
||||
self._do_add_term_test(term, 'WHERE NOT (content ILIKE %(content_1)s OR subject ILIKE %(subject_1)s) AND NOT (search_tsvector @@ plainto_tsquery(%(param_2)s, %(param_3)s))')
|
||||
self._do_add_term_test(term, 'WHERE NOT (content ILIKE %(content_1)s OR subject ILIKE %(subject_1)s) AND NOT (search_tsvector @@ plainto_tsquery(%(param_4)s, %(param_5)s))')
|
||||
|
||||
@override_settings(USING_PGROONGA=True)
|
||||
def test_add_term_using_search_operator_pgroonga(self) -> None:
|
||||
|
@ -2749,19 +2749,49 @@ recipient_id = %(recipient_id_3)s AND upper(subject) = upper(%(param_2)s))\
|
|||
def test_get_messages_with_search_queries(self) -> None:
|
||||
query_ids = self.get_query_ids()
|
||||
|
||||
sql_template = "SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \nFROM (SELECT message_id, flags, subject, rendered_content, ts_match_locs_array('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', 'jumping')) AS content_matches, ts_match_locs_array('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', 'jumping')) AS topic_matches \nFROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \nWHERE user_profile_id = {hamlet_id} AND (search_tsvector @@ plainto_tsquery('zulip.english_us_search', 'jumping')) ORDER BY message_id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC" # type: str
|
||||
sql_template = """\
|
||||
SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \n\
|
||||
FROM (SELECT message_id, flags, subject, rendered_content, array((SELECT ARRAY[sum(length(anon_3) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_3, '</ts-match>') - 1] AS anon_2 \n\
|
||||
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', 'jumping'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_3 \n\
|
||||
LIMIT ALL OFFSET 1)) AS content_matches, array((SELECT ARRAY[sum(length(anon_5) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_5, '</ts-match>') - 1] AS anon_4 \n\
|
||||
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', 'jumping'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_5 \n\
|
||||
LIMIT ALL OFFSET 1)) AS topic_matches \n\
|
||||
FROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \n\
|
||||
WHERE user_profile_id = {hamlet_id} AND (search_tsvector @@ plainto_tsquery('zulip.english_us_search', 'jumping')) ORDER BY message_id ASC \n\
|
||||
LIMIT 10) AS anon_1 ORDER BY message_id ASC\
|
||||
"""
|
||||
sql = sql_template.format(**query_ids)
|
||||
self.common_check_get_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 9,
|
||||
'narrow': '[["search", "jumping"]]'},
|
||||
sql)
|
||||
|
||||
sql_template = "SELECT anon_1.message_id, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \nFROM (SELECT id AS message_id, subject, rendered_content, ts_match_locs_array('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', 'jumping')) AS content_matches, ts_match_locs_array('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', 'jumping')) AS topic_matches \nFROM zerver_message \nWHERE recipient_id = {scotland_recipient} AND (search_tsvector @@ plainto_tsquery('zulip.english_us_search', 'jumping')) ORDER BY zerver_message.id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC"
|
||||
sql_template = """\
|
||||
SELECT anon_1.message_id, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \n\
|
||||
FROM (SELECT id AS message_id, subject, rendered_content, array((SELECT ARRAY[sum(length(anon_3) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_3, '</ts-match>') - 1] AS anon_2 \n\
|
||||
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', 'jumping'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_3 \n\
|
||||
LIMIT ALL OFFSET 1)) AS content_matches, array((SELECT ARRAY[sum(length(anon_5) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_5, '</ts-match>') - 1] AS anon_4 \n\
|
||||
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', 'jumping'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_5 \n\
|
||||
LIMIT ALL OFFSET 1)) AS topic_matches \n\
|
||||
FROM zerver_message \n\
|
||||
WHERE recipient_id = {scotland_recipient} AND (search_tsvector @@ plainto_tsquery('zulip.english_us_search', 'jumping')) ORDER BY zerver_message.id ASC \n\
|
||||
LIMIT 10) AS anon_1 ORDER BY message_id ASC\
|
||||
"""
|
||||
sql = sql_template.format(**query_ids)
|
||||
self.common_check_get_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 9,
|
||||
'narrow': '[["stream", "Scotland"], ["search", "jumping"]]'},
|
||||
sql)
|
||||
|
||||
sql_template = 'SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \nFROM (SELECT message_id, flags, subject, rendered_content, ts_match_locs_array(\'zulip.english_us_search\', rendered_content, plainto_tsquery(\'zulip.english_us_search\', \'"jumping" quickly\')) AS content_matches, ts_match_locs_array(\'zulip.english_us_search\', escape_html(subject), plainto_tsquery(\'zulip.english_us_search\', \'"jumping" quickly\')) AS topic_matches \nFROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \nWHERE user_profile_id = {hamlet_id} AND (content ILIKE \'%jumping%\' OR subject ILIKE \'%jumping%\') AND (search_tsvector @@ plainto_tsquery(\'zulip.english_us_search\', \'"jumping" quickly\')) ORDER BY message_id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC'
|
||||
sql_template = """\
|
||||
SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \n\
|
||||
FROM (SELECT message_id, flags, subject, rendered_content, array((SELECT ARRAY[sum(length(anon_3) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_3, '</ts-match>') - 1] AS anon_2 \n\
|
||||
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', '"jumping" quickly'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_3 \n\
|
||||
LIMIT ALL OFFSET 1)) AS content_matches, array((SELECT ARRAY[sum(length(anon_5) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_5, '</ts-match>') - 1] AS anon_4 \n\
|
||||
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', '"jumping" quickly'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_5 \n\
|
||||
LIMIT ALL OFFSET 1)) AS topic_matches \n\
|
||||
FROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \n\
|
||||
WHERE user_profile_id = {hamlet_id} AND (content ILIKE '%jumping%' OR subject ILIKE '%jumping%') AND (search_tsvector @@ plainto_tsquery('zulip.english_us_search', '"jumping" quickly')) ORDER BY message_id ASC \n\
|
||||
LIMIT 10) AS anon_1 ORDER BY message_id ASC\
|
||||
"""
|
||||
sql = sql_template.format(**query_ids)
|
||||
self.common_check_get_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 9,
|
||||
'narrow': '[["search", "\\"jumping\\" quickly"]]'},
|
||||
|
|
|
@ -56,6 +56,7 @@ from zerver.models import Message, UserProfile, Stream, Subscription, Client,\
|
|||
get_user_by_id_in_realm_including_cross_realm, get_stream_recipient
|
||||
|
||||
from sqlalchemy import func
|
||||
from sqlalchemy.dialects import postgresql
|
||||
from sqlalchemy.sql import select, join, column, literal_column, literal, and_, \
|
||||
or_, not_, union_all, alias, Selectable, ColumnElement, table
|
||||
|
||||
|
@ -86,6 +87,27 @@ ConditionTransform = Any
|
|||
|
||||
OptionalNarrowListT = Optional[List[Dict[str, Any]]]
|
||||
|
||||
# These delimiters will not appear in rendered messages or HTML-escaped topics.
|
||||
TS_START = "<ts-match>"
|
||||
TS_STOP = "</ts-match>"
|
||||
|
||||
def ts_locs_array(
    config: ColumnElement, text: ColumnElement, tsquery: ColumnElement
) -> ColumnElement:
    """Build a SQL expression listing where ``tsquery`` matches inside ``text``.

    The expression evaluates to an array of ``[offset, length]`` pairs, one
    per highlighted match, computed entirely in PostgreSQL:

    1. ``ts_headline`` (with ``HighlightAll``) wraps every match in
       ``TS_START`` ... ``TS_STOP``.
    2. The highlighted text is split on ``TS_START`` and unnested into rows.
       The first row is the text before the first match; every later row
       starts with the matched text, followed by ``TS_STOP`` and whatever
       comes before the next match.
    3. For each row, the running sum of ``length(row) - len(TS_STOP)`` over
       all *preceding* rows, plus ``len(TS_STOP)`` to compensate for the
       first row (which contains no ``TS_STOP``), gives the match's offset
       in the un-highlighted text.
    4. ``strpos(row, TS_STOP) - 1`` is the length of the matched text
       (``strpos`` is 1-based).
    5. ``OFFSET 1`` drops the leading row, which does not describe a match.

    Args:
        config: Text search configuration passed to ``ts_headline``.
        text: The document expression to search.
        tsquery: The compiled text search query.

    Returns:
        A scalar ``array(SELECT ARRAY[offset, length] ...)`` expression.
    """
    stop_len = len(TS_STOP)
    headline_options = "HighlightAll = TRUE, StartSel = %s, StopSel = %s" % (TS_START, TS_STOP)

    highlighted = func.ts_headline(config, text, tsquery, headline_options)
    fragments = func.unnest(func.string_to_array(highlighted, TS_START)).alias()
    fragment = column(fragments.name)

    # Window frame: every row strictly before the current one
    # (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING).
    offset_expr = (
        func.sum(func.length(fragment) - stop_len).over(rows=(None, -1)) + stop_len
    )
    length_expr = func.strpos(fragment, TS_STOP) - 1

    loc_rows = (
        select([postgresql.array([offset_expr, length_expr])])
        .select_from(fragments)
        .offset(1)
    )
    return func.array(loc_rows.as_scalar())
|
||||
|
||||
# When you add a new operator to this, also update zerver/lib/narrow.py
|
||||
class NarrowBuilder:
|
||||
'''
|
||||
|
@ -430,7 +452,6 @@ class NarrowBuilder:
|
|||
def _by_search_tsearch(self, query: Query, operand: str,
|
||||
maybe_negate: ConditionTransform) -> Query:
|
||||
tsquery = func.plainto_tsquery(literal("zulip.english_us_search"), literal(operand))
|
||||
ts_locs_array = func.ts_match_locs_array
|
||||
query = query.column(ts_locs_array(literal("zulip.english_us_search"),
|
||||
column("rendered_content"),
|
||||
tsquery).label("content_matches"))
|
||||
|
@ -454,9 +475,6 @@ class NarrowBuilder:
|
|||
cond = column("search_tsvector").op("@@")(tsquery)
|
||||
return query.where(maybe_negate(cond))
|
||||
|
||||
# The offsets we get from PGroonga are counted in characters
|
||||
# whereas the offsets from tsearch_extras are in bytes, so we
|
||||
# have to account for both cases in the logic below.
|
||||
def highlight_string(text: str, locs: Iterable[Tuple[int, int]]) -> str:
|
||||
highlight_start = '<span class="highlight">'
|
||||
highlight_stop = '</span>'
|
||||
|
@ -464,24 +482,16 @@ def highlight_string(text: str, locs: Iterable[Tuple[int, int]]) -> str:
|
|||
result = ''
|
||||
in_tag = False
|
||||
|
||||
text_utf8 = text.encode('utf8')
|
||||
|
||||
for loc in locs:
|
||||
(offset, length) = loc
|
||||
|
||||
# These indexes are in byte space for tsearch,
|
||||
# and they are in string space for pgroonga.
|
||||
prefix_start = pos
|
||||
prefix_end = offset
|
||||
match_start = offset
|
||||
match_end = offset + length
|
||||
|
||||
if settings.USING_PGROONGA:
|
||||
prefix = text[prefix_start:prefix_end]
|
||||
match = text[match_start:match_end]
|
||||
else:
|
||||
prefix = text_utf8[prefix_start:prefix_end].decode()
|
||||
match = text_utf8[match_start:match_end].decode()
|
||||
prefix = text[prefix_start:prefix_end]
|
||||
match = text[match_start:match_end]
|
||||
|
||||
for character in (prefix + match):
|
||||
if character == '<':
|
||||
|
@ -498,12 +508,7 @@ def highlight_string(text: str, locs: Iterable[Tuple[int, int]]) -> str:
|
|||
result += highlight_stop
|
||||
pos = match_end
|
||||
|
||||
if settings.USING_PGROONGA:
|
||||
final_frag = text[pos:]
|
||||
else:
|
||||
final_frag = text_utf8[pos:].decode()
|
||||
|
||||
result += final_frag
|
||||
result += text[pos:]
|
||||
return result
|
||||
|
||||
def get_search_fields(rendered_content: str, topic_name: str, content_matches: Iterable[Tuple[int, int]],
|
||||
|
|
Loading…
Reference in New Issue