From 9d51de37339f9a2bed6b5bbd7d37846e69b1afb2 Mon Sep 17 00:00:00 2001 From: Mateusz Mandera Date: Tue, 24 Sep 2024 00:25:03 +0200 Subject: [PATCH] fts: Dev-only, primitive prototype of text search via pg_search. Alternative to Postgres built-in text search or pgroonga. Very crude, early prototype: - Not deployable in production, since this can't be installed with simple apt install from repo, it's a bit annoying to implement. - The dev env provisioning is also primitive and probably janky, for now I just did the bare minimum to get things working for myself without having to run a bunch of manual tasks. - No highlighting of the matches yet. - See the list of TODOs in the code for a bunch more missing stuff. --- paradedb/__init__.py | 0 paradedb/migrations/0001_enable.py | 24 ++++++ paradedb/migrations/__init__.py | 0 scripts/lib/build-paradedb | 10 +++ tools/lib/provision.py | 1 + tools/linter_lib/custom_check.py | 1 + tools/setup/postgresql-init-dev-db | 6 ++ zerver/lib/narrow.py | 43 +++++++++- zerver/lib/paradedb.py | 131 +++++++++++++++++++++++++++++ zerver/lib/sqlalchemy_utils.py | 2 + zproject/computed_settings.py | 5 ++ zproject/default_settings.py | 4 + zproject/dev_settings.py | 3 +- 13 files changed, 228 insertions(+), 2 deletions(-) create mode 100644 paradedb/__init__.py create mode 100644 paradedb/migrations/0001_enable.py create mode 100644 paradedb/migrations/__init__.py create mode 100755 scripts/lib/build-paradedb create mode 100644 zerver/lib/paradedb.py diff --git a/paradedb/__init__.py b/paradedb/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/paradedb/migrations/0001_enable.py b/paradedb/migrations/0001_enable.py new file mode 100644 index 0000000000..d175fc4108 --- /dev/null +++ b/paradedb/migrations/0001_enable.py @@ -0,0 +1,24 @@ +from django.db import migrations + + +class Migration(migrations.Migration): + dependencies = [ + ("zerver", "0001_initial"), + ] + + operations = [ + migrations.RunSQL( + sql="""CALL paradedb.create_bm25( + index_name => 'fts_pg_search_idx', + schema_name => 'zulip', + table_name => 'zerver_message', + key_field => 'id', + text_fields => paradedb.field('rendered_content', + tokenizer => paradedb.tokenizer('icu', stemmer => 'English'), + record => 'position') + || paradedb.field('subject', tokenizer => paradedb.tokenizer('icu', stemmer => 'English'), record => 'position') + ); + """, + reverse_sql="CALL paradedb.drop_bm25( index_name => 'fts_pg_search_idx', schema_name => 'zulip');", + ), + ] diff --git a/paradedb/migrations/__init__.py b/paradedb/migrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/scripts/lib/build-paradedb b/scripts/lib/build-paradedb new file mode 100755 index 0000000000..9e900ae2bf --- /dev/null +++ b/scripts/lib/build-paradedb @@ -0,0 +1,10 @@ +#!/usr/bin/env bash +set -euxo pipefail + +tmpdir="$(mktemp -d)" +trap 'rm -r "$tmpdir"' EXIT +cd "$tmpdir" + +curl -fLO --retry 3 "https://github.com/paradedb/paradedb/releases/download/v0.12.2/postgresql-14-pg-search_0.12.2-1PARADEDB-jammy_amd64.deb" + +dpkg -i "postgresql-14-pg-search_0.12.2-1PARADEDB-jammy_amd64.deb" diff --git a/tools/lib/provision.py b/tools/lib/provision.py index 7f48bb382e..7d3b7becb6 100755 --- a/tools/lib/provision.py +++ b/tools/lib/provision.py @@ -230,6 +230,7 @@ def install_system_deps() -> None: run_as_root(["./scripts/lib/build-groonga"]) if BUILD_PGROONGA_FROM_SOURCE: run_as_root(["./scripts/lib/build-pgroonga"]) + run_as_root(["./scripts/lib/build-paradedb"]) def install_apt_deps(deps_to_install: list[str]) -> None: diff --git a/tools/linter_lib/custom_check.py b/tools/linter_lib/custom_check.py index b8ba0a91dc..f0dc366d1b 100644 --- a/tools/linter_lib/custom_check.py +++ b/tools/linter_lib/custom_check.py @@ -12,6 +12,7 @@ FILES_WITH_LEGACY_SUBJECT = { # This basically requires a big DB migration: "zerver/lib/topic.py", "zerver/lib/topic_sqlalchemy.py", + "zerver/lib/paradedb.py", # This is for backward compatibility. "zerver/tests/test_legacy_subject.py", # Other migration-related changes require extreme care. diff --git a/tools/setup/postgresql-init-dev-db b/tools/setup/postgresql-init-dev-db index c3500da1c6..354ecf54eb 100755 --- a/tools/setup/postgresql-init-dev-db +++ b/tools/setup/postgresql-init-dev-db @@ -106,6 +106,10 @@ psql -v ON_ERROR_STOP=1 -e -h localhost "$DBNAME_BASE" "$USERNAME" < Select: - if settings.USING_PGROONGA: + if settings.USING_PARADEDB: + return self._by_search_paradedb(query, operand, maybe_negate) + elif settings.USING_PGROONGA: return self._by_search_pgroonga(query, operand, maybe_negate) else: return self._by_search_tsearch(query, operand, maybe_negate) + def _by_search_paradedb( + self, query: Select, operand: str, maybe_negate: ConditionTransform + ) -> Select: + # See search_operand_to_tantivy_query for details of what queries are supported. + text_search_query = search_operand_to_tantivy_query(operand) + + # This makes the tantivy query be passed to Postgres as a bind parameter. + # This might interfere in the future with ParadeDB's ability to "push down" + # regular SQL conditions such as "AND realm_id = N" into the tantivy query + # when they add this smart push down functionality. + # All this means for us is that we'll have to verify if it works before + # relying on it, and if it doesn't just make sure we insert the relevant + # conditions into the tantivy query ourselves when appropriate. + # + # The benefit of passing the tantivy query in a bind parameter is that + # it's more robust against SQL injection if we have bugs in + # search_operand_to_tantivy_query. + safe_text_search_query = literal(text_search_query) + final_search_operand = func.paradedb.parse(safe_text_search_query) + + # TODO: finding the matches for higlighting is not implemented yet + # https://github.com/paradedb/paradedb/issues/1778 + query = query.add_columns( + literal_column("ARRAY[]::integer[]").label("content_matches"), + literal_column("ARRAY[]::integer[]").label("topic_matches"), + ) + + if text_search_query is None: + # The operand was not parsable or not supported. Return empty results + # from the search. + return query.where(false()) + + condition = literal_column("zerver_message.id").op("@@@")(final_search_operand) + return query.where(maybe_negate(condition)) + def _by_search_pgroonga( self, query: Select, operand: str, maybe_negate: ConditionTransform ) -> Select: @@ -1438,6 +1476,9 @@ def fetch_messages( # This is a hack to tag the query we use for testing query = query.prefix_with("/* get_messages */") + # print("----------- final query -----------") + # print(query.compile(compile_kwargs={"literal_binds": True})) + # print("-----------------------------------") rows = list(sa_conn.execute(query).fetchall()) if client_requested_message_ids is not None: diff --git a/zerver/lib/paradedb.py b/zerver/lib/paradedb.py new file mode 100644 index 0000000000..f71e873697 --- /dev/null +++ b/zerver/lib/paradedb.py @@ -0,0 +1,131 @@ +import html +import re +from dataclasses import dataclass + + +@dataclass +class QueryAtom: + original_string: str + html_escaped_string: str + is_phrase: bool + + +def search_operand_to_tantivy_query(operand: str) -> str | None: + # TODO: Implement better parsing here so that we can support + # more complex queries; consider exposing some of the tantivy + # query language. + # + # A basic case that should at least be supported is search with phrase + # terms and word terms such as + # word1 "phrase with spaces" word2 "another phrase" + # For now this only supports: + # 1) single phrase search: "phrase with spaces" + # 2) words search: word1 word2 word3 + + # https://docs.paradedb.com/documentation/full-text/term#special-characters + special_chars = { + "'", + '"', + "+", + "^", + "`", + ":", + "{", + "}", + "[", + "]", + "(", + ")", + "~", + "!", + "\\", + "*", + } + # These aren't listed, but empirically seem to be clearly special symbols that need escaping + # to be treated literally. + # https://github.com/paradedb/paradedb/pull/1827 + special_chars.update({"<", ">"}) + + # The operand may have the form "", not necessarily alphanumeric. + # That's treated as a phrase search. + match = re.match(r'^"(.*)"$', operand) + if match: + extracted = match.group(1) # Return the content inside the quotes + if not extracted: + return None + if '"' in extracted: + # It doesn't seem important to support this in phrase search + # as it might complicate the parsing logic in terms of figuring out the + # boundaries of the phrase. + return None + + # Escape HTML as HTML symbols that were typed by the user in a message + # end up HTML-escaped in rendered_content. " and ' are preserved literally + # in rendered_content, so here we also need to be careful and html.escape(..., quote=False) + # to preserve them. + # + # Note: Until the TODO below is resolved however, these special characters + # are not allowed in the search anyway. + html_escaped = html.escape(extracted, quote=False) + + # TODO: The exact set of characters that may need escaping and how exactly + # to escape them is still somewhat unclear. For now, just don't allow any + # special characters. Add proper escaping when possible. + if special_chars.intersection(html_escaped): + # TODO: Alternatively, we could just remove the chars. Counterpoint: phrase + # search is supposed to be very literal, so just clearly refusing to accept + # the input might be better than altering the search. + return None + + query_atoms = [ + QueryAtom(original_string=extracted, html_escaped_string=html_escaped, is_phrase=True) + ] + else: + # The operand has the form + # word1 word2 word3 + # where each word is a single word. Zulip's interpretation of this + # search is "message contains all of these words in concat(subject, rendered_content)". + # + # The tantivy query reflecting that can be formed as: + # (rendered_content:word1 OR subject:word1) AND (rendered_content:word2 OR subject:word2) + # AND (rendered_content:word3 OR subject:word3) + query_atoms = [] + + # Ignore special characters by stripping them out of the words. + # TODO: Consider just escaping them once we fully understand how to do that + # correctly and for which characters exactly. + for word in operand.split(): + word = word.strip() + word = "".join(char for char in word if char not in special_chars) + html_escaped_word = html.escape(word, quote=False) + query_atoms.append( + QueryAtom( + original_string=word, html_escaped_string=html_escaped_word, is_phrase=False + ) + ) + + or_conditions: list[str] = [] + for query_atom in query_atoms: + # In Zulip, .rendered_content contains HTML-escaped content from the user, while .subject doesn't + # escape HTML and thus contains the original content as entered by the user. + # For that reason, QueryAtom contains both HTML escaped and original versions of the search query. + # We want to match rendered_content against the HTML escaped version, but subject against the original. + # + # E.g. if we are being queried for hello, we want to search for <b>hello</b> in + # rendered_content, but hello in subject. In practice, punctuation seems mostly ignored in our + # tantivy queries on the index with English stemming, so this is a fairly theoretical technicality; + # but might matter if we change/add tokenizers in the future. + if query_atom.is_phrase: + assert ( + len(query_atoms) == 1 + ), "Phrase search doesn't support anything else than a single phrase in the query" + or_conditions.append( + f'(rendered_content:"{query_atom.html_escaped_string}" OR subject:"{query_atom.original_string}")' + ) + else: + or_conditions.append( + f"(rendered_content:{query_atom.html_escaped_string} OR subject:{query_atom.original_string})" + ) + + final_query = " AND ".join(or_conditions) + return final_query diff --git a/zerver/lib/sqlalchemy_utils.py b/zerver/lib/sqlalchemy_utils.py index ace89e353c..e5daeb1b37 100644 --- a/zerver/lib/sqlalchemy_utils.py +++ b/zerver/lib/sqlalchemy_utils.py @@ -37,6 +37,8 @@ def get_sqlalchemy_connection() -> Iterator[Connection]: creator=get_dj_conn, poolclass=NonClosingPool, pool_reset_on_return=None, + # TODO: This is for dev debugging, remove before merging. + echo=True, ) with sqlalchemy_engine.connect().execution_options(autocommit=False) as sa_connection: yield sa_connection diff --git a/zproject/computed_settings.py b/zproject/computed_settings.py index e66b949b08..949ada05d5 100644 --- a/zproject/computed_settings.py +++ b/zproject/computed_settings.py @@ -68,6 +68,7 @@ from .configured_settings import ( STATIC_URL, SUBMIT_USAGE_STATISTICS, TORNADO_PORTS, + USING_PARADEDB, USING_PGROONGA, ZULIP_ADMINISTRATOR, ZULIP_SERVICE_PUSH_NOTIFICATIONS, @@ -276,7 +277,11 @@ INSTALLED_APPS = [ "two_factor", "two_factor.plugins.phonenumber", ] +if USING_PARADEDB: + assert not USING_PGROONGA + INSTALLED_APPS += ["paradedb"] if USING_PGROONGA: + assert not USING_PARADEDB INSTALLED_APPS += ["pgroonga"] INSTALLED_APPS += EXTRA_INSTALLED_APPS diff --git a/zproject/default_settings.py b/zproject/default_settings.py index 0747603ba1..42230f60bc 100644 --- a/zproject/default_settings.py +++ b/zproject/default_settings.py @@ -424,6 +424,10 @@ REALM_MOBILE_REMAP_URIS: dict[str, str] = {} # testing. USING_PGROONGA = False +# Whether the server is using the pg_search (paradedb) extension +# for full-text search. +USING_PARADEDB = False + # How Django should send emails. Set for most contexts in settings.py, but # available for sysadmin override in unusual cases. EMAIL_BACKEND: str | None = None diff --git a/zproject/dev_settings.py b/zproject/dev_settings.py index 20f7807a38..2aa2e5f3d7 100644 --- a/zproject/dev_settings.py +++ b/zproject/dev_settings.py @@ -99,7 +99,8 @@ TERMS_OF_SERVICE_MESSAGE: str | None = "Description of changes to the ToS!" EMBEDDED_BOTS_ENABLED = True SYSTEM_ONLY_REALMS: set[str] = set() -USING_PGROONGA = True +USING_PGROONGA = False +USING_PARADEDB = True # Flush cache after migration. POST_MIGRATION_CACHE_FLUSHING = True