fts: Dev-only, primitive prototype of text search via pg_search.

Alternative to Postgres built-in text search or pgroonga. Very crude, early prototype: - Not deployable in production, since this can't be installed with simple apt install from repo, it's a bit annoying to implement. - The dev env provisioning is also primitive and probably janky, for now I just did the bare minimum to get things working for myself without having to run a bunch of manual tasks. - No highlighting of the matches yet. - See the list of TODOs in the code for a bunch more missing stuff.
2024-09-24 00:25:03 +02:00 · 2024-09-24 00:25:03 +02:00 · 9d51de3733
parent 18246ebd9f
commit 9d51de3733
13 changed files with 228 additions and 2 deletions
--- a/paradedb/init.py
+++ b/paradedb/init.py
--- a/paradedb/migrations/0001_enable.py
+++ b/paradedb/migrations/0001_enable.py
@ -0,0 +1,24 @@
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+    dependencies = [
+        ("zerver", "0001_initial"),
+    ]
+
+    operations = [
+        migrations.RunSQL(
+            sql="""CALL paradedb.create_bm25(
+                    index_name => 'fts_pg_search_idx',
+                    schema_name => 'zulip',
+                    table_name => 'zerver_message',
+                    key_field => 'id',
+                    text_fields => paradedb.field('rendered_content',
+                                                  tokenizer => paradedb.tokenizer('icu', stemmer => 'English'),
+                                                  record => 'position')
+                                   || paradedb.field('subject', tokenizer => paradedb.tokenizer('icu', stemmer => 'English'), record => 'position')
+                );
+                """,
+            reverse_sql="CALL paradedb.drop_bm25( index_name => 'fts_pg_search_idx', schema_name => 'zulip');",
+        ),
+    ]
--- a/paradedb/migrations/init.py
+++ b/paradedb/migrations/init.py
--- a/scripts/lib/build-paradedb
+++ b/scripts/lib/build-paradedb
@ -0,0 +1,10 @@
+#!/usr/bin/env bash
+set -euxo pipefail
+
+tmpdir="$(mktemp -d)"
+trap 'rm -r "$tmpdir"' EXIT
+cd "$tmpdir"
+
+curl -fLO --retry 3 "https://github.com/paradedb/paradedb/releases/download/v0.12.2/postgresql-14-pg-search_0.12.2-1PARADEDB-jammy_amd64.deb"
+
+dpkg -i "postgresql-14-pg-search_0.12.2-1PARADEDB-jammy_amd64.deb"
--- a/tools/lib/provision.py
+++ b/tools/lib/provision.py
@ -230,6 +230,7 @@ def install_system_deps() -> None:
        run_as_root(["./scripts/lib/build-groonga"])
    if BUILD_PGROONGA_FROM_SOURCE:
        run_as_root(["./scripts/lib/build-pgroonga"])
+    run_as_root(["./scripts/lib/build-paradedb"])


 def install_apt_deps(deps_to_install: list[str]) -> None:
--- a/tools/linter_lib/custom_check.py
+++ b/tools/linter_lib/custom_check.py
@ -12,6 +12,7 @@ FILES_WITH_LEGACY_SUBJECT = {
    # This basically requires a big DB migration:
    "zerver/lib/topic.py",
    "zerver/lib/topic_sqlalchemy.py",
+    "zerver/lib/paradedb.py",
    # This is for backward compatibility.
    "zerver/tests/test_legacy_subject.py",
    # Other migration-related changes require extreme care.
--- a/tools/setup/postgresql-init-dev-db
+++ b/tools/setup/postgresql-init-dev-db
@ -106,6 +106,10 @@ psql -v ON_ERROR_STOP=1 -e -h localhost "$DBNAME_BASE" "$USERNAME" <<EOF
 CREATE SCHEMA zulip;
 EOF

+PGVERSION=$(psql --version | awk '{split($3, a, "."); print a[1]}')
+echo "shared_preload_libraries = 'pg_search'" | sudo tee "/etc/postgresql/$PGVERSION/main/conf.d/zulip-dev.conf"
+sudo systemctl restart "postgresql@${PGVERSION}-main"
+
 "${ROOT_POSTGRES[@]}" -v ON_ERROR_STOP=1 -e "$DBNAME_BASE" <<EOF
 CREATE EXTENSION pgroonga;

@ -116,6 +120,8 @@ SELECT unnest(
        ARRAY['ALTER SYSTEM SET pgroonga.enable_wal = ''on''', 'SELECT pg_reload_conf()']
    END
 ) \gexec
+
+CREATE EXTENSION pg_search;
 EOF

 psql -v ON_ERROR_STOP=1 -e -h localhost postgres "$USERNAME" <<EOF
--- a/zerver/lib/narrow.py
+++ b/zerver/lib/narrow.py
@ -40,6 +40,7 @@ from zerver.lib.message import (
    get_first_visible_message_id,
 )
 from zerver.lib.narrow_predicate import channel_operators, channels_operators
+from zerver.lib.paradedb import search_operand_to_tantivy_query
 from zerver.lib.recipient_users import recipient_for_user_profiles
 from zerver.lib.sqlalchemy_utils import get_sqlalchemy_connection
 from zerver.lib.streams import (
@ -757,11 +758,48 @@ class NarrowBuilder:
        return query.where(maybe_negate(cond))

    def by_search(self, query: Select, operand: str, maybe_negate: ConditionTransform) -> Select:
-        if settings.USING_PGROONGA:
+        if settings.USING_PARADEDB:
+            return self._by_search_paradedb(query, operand, maybe_negate)
+        elif settings.USING_PGROONGA:
            return self._by_search_pgroonga(query, operand, maybe_negate)
        else:
            return self._by_search_tsearch(query, operand, maybe_negate)

+    def _by_search_paradedb(
+        self, query: Select, operand: str, maybe_negate: ConditionTransform
+    ) -> Select:
+        # See search_operand_to_tantivy_query for details of what queries are supported.
+        text_search_query = search_operand_to_tantivy_query(operand)
+
+        # This makes the tantivy query be passed to Postgres as a bind parameter.
+        # This might interfere in the future with ParadeDB's ability to "push down"
+        # regular SQL conditions such as "AND realm_id = N" into the tantivy query
+        # when they add this smart push down functionality.
+        # All this means for us is that we'll have to verify if it works before
+        # relying on it, and if it doesn't just make sure we insert the relevant
+        # conditions into the tantivy query ourselves when appropriate.
+        #
+        # The benefit of passing the tantivy query in a bind parameter is that
+        # it's more robust against SQL injection if we have bugs in
+        # search_operand_to_tantivy_query.
+        safe_text_search_query = literal(text_search_query)
+        final_search_operand = func.paradedb.parse(safe_text_search_query)
+
+        # TODO: finding the matches for higlighting is not implemented yet
+        # https://github.com/paradedb/paradedb/issues/1778
+        query = query.add_columns(
+            literal_column("ARRAY[]::integer[]").label("content_matches"),
+            literal_column("ARRAY[]::integer[]").label("topic_matches"),
+        )
+
+        if text_search_query is None:
+            # The operand was not parsable or not supported. Return empty results
+            # from the search.
+            return query.where(false())
+
+        condition = literal_column("zerver_message.id").op("@@@")(final_search_operand)
+        return query.where(maybe_negate(condition))
+
    def _by_search_pgroonga(
        self, query: Select, operand: str, maybe_negate: ConditionTransform
    ) -> Select:
@ -1438,6 +1476,9 @@ def fetch_messages(

        # This is a hack to tag the query we use for testing
        query = query.prefix_with("/* get_messages */")
+        # print("----------- final query -----------")
+        # print(query.compile(compile_kwargs={"literal_binds": True}))
+        # print("-----------------------------------")
        rows = list(sa_conn.execute(query).fetchall())

    if client_requested_message_ids is not None:
--- a/zerver/lib/paradedb.py
+++ b/zerver/lib/paradedb.py
@ -0,0 +1,131 @@
+import html
+import re
+from dataclasses import dataclass
+
+
+@dataclass
+class QueryAtom:
+    original_string: str
+    html_escaped_string: str
+    is_phrase: bool
+
+
+def search_operand_to_tantivy_query(operand: str) -> str | None:
+    # TODO: Implement better parsing here so that we can support
+    # more complex queries; consider exposing some of the tantivy
+    # query language.
+    #
+    # A basic case that should at least be supported is search with phrase
+    # terms and word terms such as
+    # word1 "phrase with spaces" word2 "another phrase"
+    # For now this only supports:
+    # 1) single phrase search: "phrase with spaces"
+    # 2) words search: word1 word2 word3
+
+    # https://docs.paradedb.com/documentation/full-text/term#special-characters
+    special_chars = {
+        "'",
+        '"',
+        "+",
+        "^",
+        "`",
+        ":",
+        "{",
+        "}",
+        "[",
+        "]",
+        "(",
+        ")",
+        "~",
+        "!",
+        "\\",
+        "*",
+    }
+    # These aren't listed, but empirically seem to be clearly special symbols that need escaping
+    # to be treated literally.
+    # https://github.com/paradedb/paradedb/pull/1827
+    special_chars.update({"<", ">"})
+
+    # The operand may have the form "<some content>", not necessarily alphanumeric.
+    # That's treated as a phrase search.
+    match = re.match(r'^"(.*)"$', operand)
+    if match:
+        extracted = match.group(1)  # Return the content inside the quotes
+        if not extracted:
+            return None
+        if '"' in extracted:
+            # It doesn't seem important to support this in phrase search
+            # as it might complicate the parsing logic in terms of figuring out the
+            # boundaries of the phrase.
+            return None
+
+        # Escape HTML as HTML symbols that were typed by the user in a message
+        # end up HTML-escaped in rendered_content. " and ' are preserved literally
+        # in rendered_content, so here we also need to be careful and html.escape(..., quote=False)
+        # to preserve them.
+        #
+        # Note: Until the TODO below is resolved however, these special characters
+        # are not allowed in the search anyway.
+        html_escaped = html.escape(extracted, quote=False)
+
+        # TODO: The exact set of characters that may need escaping and how exactly
+        # to escape them is still somewhat unclear. For now, just don't allow any
+        # special characters. Add proper escaping when possible.
+        if special_chars.intersection(html_escaped):
+            # TODO: Alternatively, we could just remove the chars. Counterpoint: phrase
+            # search is supposed to be very literal, so just clearly refusing to accept
+            # the input might be better than altering the search.
+            return None
+
+        query_atoms = [
+            QueryAtom(original_string=extracted, html_escaped_string=html_escaped, is_phrase=True)
+        ]
+    else:
+        # The operand has the form
+        # word1 word2 word3
+        # where each word is a single word. Zulip's interpretation of this
+        # search is "message contains all of these words in concat(subject, rendered_content)".
+        #
+        # The tantivy query reflecting that can be formed as:
+        # (rendered_content:word1 OR subject:word1) AND (rendered_content:word2 OR subject:word2)
+        # AND (rendered_content:word3 OR subject:word3)
+        query_atoms = []
+
+        # Ignore special characters by stripping them out of the words.
+        # TODO: Consider just escaping them once we fully understand how to do that
+        # correctly and for which characters exactly.
+        for word in operand.split():
+            word = word.strip()
+            word = "".join(char for char in word if char not in special_chars)
+            html_escaped_word = html.escape(word, quote=False)
+            query_atoms.append(
+                QueryAtom(
+                    original_string=word, html_escaped_string=html_escaped_word, is_phrase=False
+                )
+            )
+
+    or_conditions: list[str] = []
+    for query_atom in query_atoms:
+        # In Zulip, .rendered_content contains HTML-escaped content from the user, while .subject doesn't
+        # escape HTML and thus contains the original content as entered by the user.
+        # For that reason, QueryAtom contains both HTML escaped and original versions of the search query.
+        # We want to match rendered_content against the HTML escaped version, but subject against the original.
+        #
+        # E.g. if we are being queried for <b>hello</b>, we want to search for &lt;b&gt;hello&lt;/b&gt in
+        # rendered_content, but <b>hello</b> in subject. In practice, punctuation seems mostly ignored in our
+        # tantivy queries on the index with English stemming, so this is a fairly theoretical technicality;
+        # but might matter if we change/add tokenizers in the future.
+        if query_atom.is_phrase:
+            assert (
+                len(query_atoms) == 1
+            ), "Phrase search doesn't support anything else than a single phrase in the query"
+            or_conditions.append(
+                f'(rendered_content:"{query_atom.html_escaped_string}" OR subject:"{query_atom.original_string}")'
+            )
+        else:
+            or_conditions.append(
+                f"(rendered_content:{query_atom.html_escaped_string} OR subject:{query_atom.original_string})"
+            )
+
+    final_query = " AND ".join(or_conditions)
+    return final_query
--- a/zerver/lib/sqlalchemy_utils.py
+++ b/zerver/lib/sqlalchemy_utils.py
@ -37,6 +37,8 @@ def get_sqlalchemy_connection() -> Iterator[Connection]:
            creator=get_dj_conn,
            poolclass=NonClosingPool,
            pool_reset_on_return=None,
+            # TODO: This is for dev debugging, remove before merging.
+            echo=True,
        )
    with sqlalchemy_engine.connect().execution_options(autocommit=False) as sa_connection:
        yield sa_connection
--- a/zproject/computed_settings.py
+++ b/zproject/computed_settings.py
@ -68,6 +68,7 @@ from .configured_settings import (
    STATIC_URL,
    SUBMIT_USAGE_STATISTICS,
    TORNADO_PORTS,
+    USING_PARADEDB,
    USING_PGROONGA,
    ZULIP_ADMINISTRATOR,
    ZULIP_SERVICE_PUSH_NOTIFICATIONS,
@ -276,7 +277,11 @@ INSTALLED_APPS = [
    "two_factor",
    "two_factor.plugins.phonenumber",
 ]
+if USING_PARADEDB:
+    assert not USING_PGROONGA
+    INSTALLED_APPS += ["paradedb"]
 if USING_PGROONGA:
+    assert not USING_PARADEDB
    INSTALLED_APPS += ["pgroonga"]
 INSTALLED_APPS += EXTRA_INSTALLED_APPS

--- a/zproject/default_settings.py
+++ b/zproject/default_settings.py
@ -424,6 +424,10 @@ REALM_MOBILE_REMAP_URIS: dict[str, str] = {}
 # testing.
 USING_PGROONGA = False

+# Whether the server is using the pg_search (paradedb) extension
+# for full-text search.
+USING_PARADEDB = False
+
 # How Django should send emails.  Set for most contexts in settings.py, but
 # available for sysadmin override in unusual cases.
 EMAIL_BACKEND: str | None = None
--- a/zproject/dev_settings.py
+++ b/zproject/dev_settings.py
@ -99,7 +99,8 @@ TERMS_OF_SERVICE_MESSAGE: str | None = "Description of changes to the ToS!"
 EMBEDDED_BOTS_ENABLED = True

 SYSTEM_ONLY_REALMS: set[str] = set()
-USING_PGROONGA = True
+USING_PGROONGA = False
+USING_PARADEDB = True
 # Flush cache after migration.
 POST_MIGRATION_CACHE_FLUSHING = True