fts: Dev-only, primitive prototype of text search via pg_search.

Alternative to Postgres built-in text search or pgroonga.

Very crude, early prototype:
- Not deployable in production, since this can't be installed with
simple apt install from repo, it's a bit annoying to implement.
- The dev env provisioning is also primitive and probably janky, for now
I just did the bare minimum to get things working for myself without
having to run a bunch of manual tasks.
- No highlighting of the matches yet.
- See the list of TODOs in the code for a bunch more missing stuff.
This commit is contained in:
Mateusz Mandera 2024-09-24 00:25:03 +02:00
parent 18246ebd9f
commit 9d51de3733
13 changed files with 228 additions and 2 deletions

0
paradedb/__init__.py Normal file
View File

View File

@ -0,0 +1,24 @@
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
("zerver", "0001_initial"),
]
operations = [
migrations.RunSQL(
sql="""CALL paradedb.create_bm25(
index_name => 'fts_pg_search_idx',
schema_name => 'zulip',
table_name => 'zerver_message',
key_field => 'id',
text_fields => paradedb.field('rendered_content',
tokenizer => paradedb.tokenizer('icu', stemmer => 'English'),
record => 'position')
|| paradedb.field('subject', tokenizer => paradedb.tokenizer('icu', stemmer => 'English'), record => 'position')
);
""",
reverse_sql="CALL paradedb.drop_bm25( index_name => 'fts_pg_search_idx', schema_name => 'zulip');",
),
]

View File

10
scripts/lib/build-paradedb Executable file
View File

@ -0,0 +1,10 @@
#!/usr/bin/env bash
set -euxo pipefail
tmpdir="$(mktemp -d)"
trap 'rm -r "$tmpdir"' EXIT
cd "$tmpdir"
curl -fLO --retry 3 "https://github.com/paradedb/paradedb/releases/download/v0.12.2/postgresql-14-pg-search_0.12.2-1PARADEDB-jammy_amd64.deb"
dpkg -i "postgresql-14-pg-search_0.12.2-1PARADEDB-jammy_amd64.deb"

View File

@ -230,6 +230,7 @@ def install_system_deps() -> None:
run_as_root(["./scripts/lib/build-groonga"])
if BUILD_PGROONGA_FROM_SOURCE:
run_as_root(["./scripts/lib/build-pgroonga"])
run_as_root(["./scripts/lib/build-paradedb"])
def install_apt_deps(deps_to_install: list[str]) -> None:

View File

@ -12,6 +12,7 @@ FILES_WITH_LEGACY_SUBJECT = {
# This basically requires a big DB migration:
"zerver/lib/topic.py",
"zerver/lib/topic_sqlalchemy.py",
"zerver/lib/paradedb.py",
# This is for backward compatibility.
"zerver/tests/test_legacy_subject.py",
# Other migration-related changes require extreme care.

View File

@ -106,6 +106,10 @@ psql -v ON_ERROR_STOP=1 -e -h localhost "$DBNAME_BASE" "$USERNAME" <<EOF
CREATE SCHEMA zulip;
EOF
PGVERSION=$(psql --version | awk '{split($3, a, "."); print a[1]}')
echo "shared_preload_libraries = 'pg_search'" | sudo tee "/etc/postgresql/$PGVERSION/main/conf.d/zulip-dev.conf"
sudo systemctl restart "postgresql@${PGVERSION}-main"
"${ROOT_POSTGRES[@]}" -v ON_ERROR_STOP=1 -e "$DBNAME_BASE" <<EOF
CREATE EXTENSION pgroonga;
@ -116,6 +120,8 @@ SELECT unnest(
ARRAY['ALTER SYSTEM SET pgroonga.enable_wal = ''on''', 'SELECT pg_reload_conf()']
END
) \gexec
CREATE EXTENSION pg_search;
EOF
psql -v ON_ERROR_STOP=1 -e -h localhost postgres "$USERNAME" <<EOF

View File

@ -40,6 +40,7 @@ from zerver.lib.message import (
get_first_visible_message_id,
)
from zerver.lib.narrow_predicate import channel_operators, channels_operators
from zerver.lib.paradedb import search_operand_to_tantivy_query
from zerver.lib.recipient_users import recipient_for_user_profiles
from zerver.lib.sqlalchemy_utils import get_sqlalchemy_connection
from zerver.lib.streams import (
@ -757,11 +758,48 @@ class NarrowBuilder:
return query.where(maybe_negate(cond))
def by_search(self, query: Select, operand: str, maybe_negate: ConditionTransform) -> Select:
if settings.USING_PGROONGA:
if settings.USING_PARADEDB:
return self._by_search_paradedb(query, operand, maybe_negate)
elif settings.USING_PGROONGA:
return self._by_search_pgroonga(query, operand, maybe_negate)
else:
return self._by_search_tsearch(query, operand, maybe_negate)
def _by_search_paradedb(
self, query: Select, operand: str, maybe_negate: ConditionTransform
) -> Select:
# See search_operand_to_tantivy_query for details of what queries are supported.
text_search_query = search_operand_to_tantivy_query(operand)
# This makes the tantivy query be passed to Postgres as a bind parameter.
# This might interfere in the future with ParadeDB's ability to "push down"
# regular SQL conditions such as "AND realm_id = N" into the tantivy query
# when they add this smart push down functionality.
# All this means for us is that we'll have to verify if it works before
# relying on it, and if it doesn't just make sure we insert the relevant
# conditions into the tantivy query ourselves when appropriate.
#
# The benefit of passing the tantivy query in a bind parameter is that
# it's more robust against SQL injection if we have bugs in
# search_operand_to_tantivy_query.
safe_text_search_query = literal(text_search_query)
final_search_operand = func.paradedb.parse(safe_text_search_query)
# TODO: finding the matches for higlighting is not implemented yet
# https://github.com/paradedb/paradedb/issues/1778
query = query.add_columns(
literal_column("ARRAY[]::integer[]").label("content_matches"),
literal_column("ARRAY[]::integer[]").label("topic_matches"),
)
if text_search_query is None:
# The operand was not parsable or not supported. Return empty results
# from the search.
return query.where(false())
condition = literal_column("zerver_message.id").op("@@@")(final_search_operand)
return query.where(maybe_negate(condition))
def _by_search_pgroonga(
self, query: Select, operand: str, maybe_negate: ConditionTransform
) -> Select:
@ -1438,6 +1476,9 @@ def fetch_messages(
# This is a hack to tag the query we use for testing
query = query.prefix_with("/* get_messages */")
# print("----------- final query -----------")
# print(query.compile(compile_kwargs={"literal_binds": True}))
# print("-----------------------------------")
rows = list(sa_conn.execute(query).fetchall())
if client_requested_message_ids is not None:

131
zerver/lib/paradedb.py Normal file
View File

@ -0,0 +1,131 @@
import html
import re
from dataclasses import dataclass
@dataclass
class QueryAtom:
original_string: str
html_escaped_string: str
is_phrase: bool
def search_operand_to_tantivy_query(operand: str) -> str | None:
# TODO: Implement better parsing here so that we can support
# more complex queries; consider exposing some of the tantivy
# query language.
#
# A basic case that should at least be supported is search with phrase
# terms and word terms such as
# word1 "phrase with spaces" word2 "another phrase"
# For now this only supports:
# 1) single phrase search: "phrase with spaces"
# 2) words search: word1 word2 word3
# https://docs.paradedb.com/documentation/full-text/term#special-characters
special_chars = {
"'",
'"',
"+",
"^",
"`",
":",
"{",
"}",
"[",
"]",
"(",
")",
"~",
"!",
"\\",
"*",
}
# These aren't listed, but empirically seem to be clearly special symbols that need escaping
# to be treated literally.
# https://github.com/paradedb/paradedb/pull/1827
special_chars.update({"<", ">"})
# The operand may have the form "<some content>", not necessarily alphanumeric.
# That's treated as a phrase search.
match = re.match(r'^"(.*)"$', operand)
if match:
extracted = match.group(1) # Return the content inside the quotes
if not extracted:
return None
if '"' in extracted:
# It doesn't seem important to support this in phrase search
# as it might complicate the parsing logic in terms of figuring out the
# boundaries of the phrase.
return None
# Escape HTML as HTML symbols that were typed by the user in a message
# end up HTML-escaped in rendered_content. " and ' are preserved literally
# in rendered_content, so here we also need to be careful and html.escape(..., quote=False)
# to preserve them.
#
# Note: Until the TODO below is resolved however, these special characters
# are not allowed in the search anyway.
html_escaped = html.escape(extracted, quote=False)
# TODO: The exact set of characters that may need escaping and how exactly
# to escape them is still somewhat unclear. For now, just don't allow any
# special characters. Add proper escaping when possible.
if special_chars.intersection(html_escaped):
# TODO: Alternatively, we could just remove the chars. Counterpoint: phrase
# search is supposed to be very literal, so just clearly refusing to accept
# the input might be better than altering the search.
return None
query_atoms = [
QueryAtom(original_string=extracted, html_escaped_string=html_escaped, is_phrase=True)
]
else:
# The operand has the form
# word1 word2 word3
# where each word is a single word. Zulip's interpretation of this
# search is "message contains all of these words in concat(subject, rendered_content)".
#
# The tantivy query reflecting that can be formed as:
# (rendered_content:word1 OR subject:word1) AND (rendered_content:word2 OR subject:word2)
# AND (rendered_content:word3 OR subject:word3)
query_atoms = []
# Ignore special characters by stripping them out of the words.
# TODO: Consider just escaping them once we fully understand how to do that
# correctly and for which characters exactly.
for word in operand.split():
word = word.strip()
word = "".join(char for char in word if char not in special_chars)
html_escaped_word = html.escape(word, quote=False)
query_atoms.append(
QueryAtom(
original_string=word, html_escaped_string=html_escaped_word, is_phrase=False
)
)
or_conditions: list[str] = []
for query_atom in query_atoms:
# In Zulip, .rendered_content contains HTML-escaped content from the user, while .subject doesn't
# escape HTML and thus contains the original content as entered by the user.
# For that reason, QueryAtom contains both HTML escaped and original versions of the search query.
# We want to match rendered_content against the HTML escaped version, but subject against the original.
#
# E.g. if we are being queried for <b>hello</b>, we want to search for &lt;b&gt;hello&lt;/b&gt in
# rendered_content, but <b>hello</b> in subject. In practice, punctuation seems mostly ignored in our
# tantivy queries on the index with English stemming, so this is a fairly theoretical technicality;
# but might matter if we change/add tokenizers in the future.
if query_atom.is_phrase:
assert (
len(query_atoms) == 1
), "Phrase search doesn't support anything else than a single phrase in the query"
or_conditions.append(
f'(rendered_content:"{query_atom.html_escaped_string}" OR subject:"{query_atom.original_string}")'
)
else:
or_conditions.append(
f"(rendered_content:{query_atom.html_escaped_string} OR subject:{query_atom.original_string})"
)
final_query = " AND ".join(or_conditions)
return final_query

View File

@ -37,6 +37,8 @@ def get_sqlalchemy_connection() -> Iterator[Connection]:
creator=get_dj_conn,
poolclass=NonClosingPool,
pool_reset_on_return=None,
# TODO: This is for dev debugging, remove before merging.
echo=True,
)
with sqlalchemy_engine.connect().execution_options(autocommit=False) as sa_connection:
yield sa_connection

View File

@ -68,6 +68,7 @@ from .configured_settings import (
STATIC_URL,
SUBMIT_USAGE_STATISTICS,
TORNADO_PORTS,
USING_PARADEDB,
USING_PGROONGA,
ZULIP_ADMINISTRATOR,
ZULIP_SERVICE_PUSH_NOTIFICATIONS,
@ -276,7 +277,11 @@ INSTALLED_APPS = [
"two_factor",
"two_factor.plugins.phonenumber",
]
if USING_PARADEDB:
assert not USING_PGROONGA
INSTALLED_APPS += ["paradedb"]
if USING_PGROONGA:
assert not USING_PARADEDB
INSTALLED_APPS += ["pgroonga"]
INSTALLED_APPS += EXTRA_INSTALLED_APPS

View File

@ -424,6 +424,10 @@ REALM_MOBILE_REMAP_URIS: dict[str, str] = {}
# testing.
USING_PGROONGA = False
# Whether the server is using the pg_search (paradedb) extension
# for full-text search.
USING_PARADEDB = False
# How Django should send emails. Set for most contexts in settings.py, but
# available for sysadmin override in unusual cases.
EMAIL_BACKEND: str | None = None

View File

@ -99,7 +99,8 @@ TERMS_OF_SERVICE_MESSAGE: str | None = "Description of changes to the ToS!"
EMBEDDED_BOTS_ENABLED = True
SYSTEM_ONLY_REALMS: set[str] = set()
USING_PGROONGA = True
USING_PGROONGA = False
USING_PARADEDB = True
# Flush cache after migration.
POST_MIGRATION_CACHE_FLUSHING = True