search: Reimplement ts_locs_array in pure PostgreSQL.

Signed-off-by: Anders Kaseorg <anders@zulipchat.com>
This commit is contained in:
Anders Kaseorg 2019-08-28 02:06:38 -07:00
parent d2d4800b1c
commit c3e395b7d8
2 changed files with 60 additions and 25 deletions

View File

@ -340,13 +340,13 @@ class NarrowBuilderTest(ZulipTestCase):
@override_settings(USING_PGROONGA=False)
def test_add_term_using_search_operator(self) -> None:
term = dict(operator='search', operand='"french fries"')
self._do_add_term_test(term, 'WHERE (content ILIKE %(content_1)s OR subject ILIKE %(subject_1)s) AND (search_tsvector @@ plainto_tsquery(%(param_2)s, %(param_3)s))')
self._do_add_term_test(term, 'WHERE (content ILIKE %(content_1)s OR subject ILIKE %(subject_1)s) AND (search_tsvector @@ plainto_tsquery(%(param_4)s, %(param_5)s))')
@override_settings(USING_PGROONGA=False)
def test_add_term_using_search_operator_and_negated(
self) -> None: # NEGATED
term = dict(operator='search', operand='"french fries"', negated=True)
self._do_add_term_test(term, 'WHERE NOT (content ILIKE %(content_1)s OR subject ILIKE %(subject_1)s) AND NOT (search_tsvector @@ plainto_tsquery(%(param_2)s, %(param_3)s))')
self._do_add_term_test(term, 'WHERE NOT (content ILIKE %(content_1)s OR subject ILIKE %(subject_1)s) AND NOT (search_tsvector @@ plainto_tsquery(%(param_4)s, %(param_5)s))')
@override_settings(USING_PGROONGA=True)
def test_add_term_using_search_operator_pgroonga(self) -> None:
@ -2749,19 +2749,49 @@ recipient_id = %(recipient_id_3)s AND upper(subject) = upper(%(param_2)s))\
def test_get_messages_with_search_queries(self) -> None:
query_ids = self.get_query_ids()
sql_template = "SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \nFROM (SELECT message_id, flags, subject, rendered_content, ts_match_locs_array('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', 'jumping')) AS content_matches, ts_match_locs_array('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', 'jumping')) AS topic_matches \nFROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \nWHERE user_profile_id = {hamlet_id} AND (search_tsvector @@ plainto_tsquery('zulip.english_us_search', 'jumping')) ORDER BY message_id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC" # type: str
sql_template = """\
SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \n\
FROM (SELECT message_id, flags, subject, rendered_content, array((SELECT ARRAY[sum(length(anon_3) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_3, '</ts-match>') - 1] AS anon_2 \n\
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', 'jumping'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_3 \n\
LIMIT ALL OFFSET 1)) AS content_matches, array((SELECT ARRAY[sum(length(anon_5) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_5, '</ts-match>') - 1] AS anon_4 \n\
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', 'jumping'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_5 \n\
LIMIT ALL OFFSET 1)) AS topic_matches \n\
FROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \n\
WHERE user_profile_id = {hamlet_id} AND (search_tsvector @@ plainto_tsquery('zulip.english_us_search', 'jumping')) ORDER BY message_id ASC \n\
LIMIT 10) AS anon_1 ORDER BY message_id ASC\
"""
sql = sql_template.format(**query_ids)
self.common_check_get_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 9,
'narrow': '[["search", "jumping"]]'},
sql)
sql_template = "SELECT anon_1.message_id, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \nFROM (SELECT id AS message_id, subject, rendered_content, ts_match_locs_array('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', 'jumping')) AS content_matches, ts_match_locs_array('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', 'jumping')) AS topic_matches \nFROM zerver_message \nWHERE recipient_id = {scotland_recipient} AND (search_tsvector @@ plainto_tsquery('zulip.english_us_search', 'jumping')) ORDER BY zerver_message.id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC"
sql_template = """\
SELECT anon_1.message_id, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \n\
FROM (SELECT id AS message_id, subject, rendered_content, array((SELECT ARRAY[sum(length(anon_3) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_3, '</ts-match>') - 1] AS anon_2 \n\
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', 'jumping'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_3 \n\
LIMIT ALL OFFSET 1)) AS content_matches, array((SELECT ARRAY[sum(length(anon_5) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_5, '</ts-match>') - 1] AS anon_4 \n\
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', 'jumping'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_5 \n\
LIMIT ALL OFFSET 1)) AS topic_matches \n\
FROM zerver_message \n\
WHERE recipient_id = {scotland_recipient} AND (search_tsvector @@ plainto_tsquery('zulip.english_us_search', 'jumping')) ORDER BY zerver_message.id ASC \n\
LIMIT 10) AS anon_1 ORDER BY message_id ASC\
"""
sql = sql_template.format(**query_ids)
self.common_check_get_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 9,
'narrow': '[["stream", "Scotland"], ["search", "jumping"]]'},
sql)
sql_template = 'SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \nFROM (SELECT message_id, flags, subject, rendered_content, ts_match_locs_array(\'zulip.english_us_search\', rendered_content, plainto_tsquery(\'zulip.english_us_search\', \'"jumping" quickly\')) AS content_matches, ts_match_locs_array(\'zulip.english_us_search\', escape_html(subject), plainto_tsquery(\'zulip.english_us_search\', \'"jumping" quickly\')) AS topic_matches \nFROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \nWHERE user_profile_id = {hamlet_id} AND (content ILIKE \'%jumping%\' OR subject ILIKE \'%jumping%\') AND (search_tsvector @@ plainto_tsquery(\'zulip.english_us_search\', \'"jumping" quickly\')) ORDER BY message_id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC'
sql_template = """\
SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.topic_matches \n\
FROM (SELECT message_id, flags, subject, rendered_content, array((SELECT ARRAY[sum(length(anon_3) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_3, '</ts-match>') - 1] AS anon_2 \n\
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', rendered_content, plainto_tsquery('zulip.english_us_search', '"jumping" quickly'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_3 \n\
LIMIT ALL OFFSET 1)) AS content_matches, array((SELECT ARRAY[sum(length(anon_5) - 11) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND 1 PRECEDING) + 11, strpos(anon_5, '</ts-match>') - 1] AS anon_4 \n\
FROM unnest(string_to_array(ts_headline('zulip.english_us_search', escape_html(subject), plainto_tsquery('zulip.english_us_search', '"jumping" quickly'), 'HighlightAll = TRUE, StartSel = <ts-match>, StopSel = </ts-match>'), '<ts-match>')) AS anon_5 \n\
LIMIT ALL OFFSET 1)) AS topic_matches \n\
FROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \n\
WHERE user_profile_id = {hamlet_id} AND (content ILIKE '%jumping%' OR subject ILIKE '%jumping%') AND (search_tsvector @@ plainto_tsquery('zulip.english_us_search', '"jumping" quickly')) ORDER BY message_id ASC \n\
LIMIT 10) AS anon_1 ORDER BY message_id ASC\
"""
sql = sql_template.format(**query_ids)
self.common_check_get_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 9,
'narrow': '[["search", "\\"jumping\\" quickly"]]'},

View File

@ -56,6 +56,7 @@ from zerver.models import Message, UserProfile, Stream, Subscription, Client,\
get_user_by_id_in_realm_including_cross_realm, get_stream_recipient
from sqlalchemy import func
from sqlalchemy.dialects import postgresql
from sqlalchemy.sql import select, join, column, literal_column, literal, and_, \
or_, not_, union_all, alias, Selectable, ColumnElement, table
@ -86,6 +87,27 @@ ConditionTransform = Any
OptionalNarrowListT = Optional[List[Dict[str, Any]]]
# These delimiters will not appear in rendered messages or HTML-escaped topics.
TS_START = "<ts-match>"
TS_STOP = "</ts-match>"
def ts_locs_array(
config: ColumnElement, text: ColumnElement, tsquery: ColumnElement
) -> ColumnElement:
options = "HighlightAll = TRUE, StartSel = %s, StopSel = %s" % (TS_START, TS_STOP)
delimited = func.ts_headline(config, text, tsquery, options)
parts = func.unnest(func.string_to_array(delimited, TS_START)).alias()
part = column(parts.name)
part_len = func.length(part) - len(TS_STOP)
match_pos = func.sum(part_len).over(rows=(None, -1)) + len(TS_STOP)
match_len = func.strpos(part, TS_STOP) - 1
return func.array(
select([postgresql.array([match_pos, match_len])])
.select_from(parts)
.offset(1)
.as_scalar()
)
# When you add a new operator to this, also update zerver/lib/narrow.py
class NarrowBuilder:
'''
@ -430,7 +452,6 @@ class NarrowBuilder:
def _by_search_tsearch(self, query: Query, operand: str,
maybe_negate: ConditionTransform) -> Query:
tsquery = func.plainto_tsquery(literal("zulip.english_us_search"), literal(operand))
ts_locs_array = func.ts_match_locs_array
query = query.column(ts_locs_array(literal("zulip.english_us_search"),
column("rendered_content"),
tsquery).label("content_matches"))
@ -454,9 +475,6 @@ class NarrowBuilder:
cond = column("search_tsvector").op("@@")(tsquery)
return query.where(maybe_negate(cond))
# The offsets we get from PGroonga are counted in characters
# whereas the offsets from tsearch_extras are in bytes, so we
# have to account for both cases in the logic below.
def highlight_string(text: str, locs: Iterable[Tuple[int, int]]) -> str:
highlight_start = '<span class="highlight">'
highlight_stop = '</span>'
@ -464,24 +482,16 @@ def highlight_string(text: str, locs: Iterable[Tuple[int, int]]) -> str:
result = ''
in_tag = False
text_utf8 = text.encode('utf8')
for loc in locs:
(offset, length) = loc
# These indexes are in byte space for tsearch,
# and they are in string space for pgroonga.
prefix_start = pos
prefix_end = offset
match_start = offset
match_end = offset + length
if settings.USING_PGROONGA:
prefix = text[prefix_start:prefix_end]
match = text[match_start:match_end]
else:
prefix = text_utf8[prefix_start:prefix_end].decode()
match = text_utf8[match_start:match_end].decode()
prefix = text[prefix_start:prefix_end]
match = text[match_start:match_end]
for character in (prefix + match):
if character == '<':
@ -498,12 +508,7 @@ def highlight_string(text: str, locs: Iterable[Tuple[int, int]]) -> str:
result += highlight_stop
pos = match_end
if settings.USING_PGROONGA:
final_frag = text[pos:]
else:
final_frag = text_utf8[pos:].decode()
result += final_frag
result += text[pos:]
return result
def get_search_fields(rendered_content: str, topic_name: str, content_matches: Iterable[Tuple[int, int]],