mirror of https://github.com/zulip/zulip.git
fts: Dev-only, primitive prototype of text search via pg_search.
Alternative to Postgres built-in text search or pgroonga. Very crude, early prototype: - Not deployable in production, since this can't be installed with simple apt install from repo, it's a bit annoying to implement. - The dev env provisioning is also primitive and probably janky, for now I just did the bare minimum to get things working for myself without having to run a bunch of manual tasks. - No highlighting of the matches yet. - See the list of TODOs in the code for a bunch more missing stuff.
This commit is contained in:
parent
18246ebd9f
commit
9d51de3733
|
@ -0,0 +1,24 @@
|
|||
from django.db import migrations
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
dependencies = [
|
||||
("zerver", "0001_initial"),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RunSQL(
|
||||
sql="""CALL paradedb.create_bm25(
|
||||
index_name => 'fts_pg_search_idx',
|
||||
schema_name => 'zulip',
|
||||
table_name => 'zerver_message',
|
||||
key_field => 'id',
|
||||
text_fields => paradedb.field('rendered_content',
|
||||
tokenizer => paradedb.tokenizer('icu', stemmer => 'English'),
|
||||
record => 'position')
|
||||
|| paradedb.field('subject', tokenizer => paradedb.tokenizer('icu', stemmer => 'English'), record => 'position')
|
||||
);
|
||||
""",
|
||||
reverse_sql="CALL paradedb.drop_bm25( index_name => 'fts_pg_search_idx', schema_name => 'zulip');",
|
||||
),
|
||||
]
|
|
@ -0,0 +1,10 @@
|
|||
#!/usr/bin/env bash
|
||||
set -euxo pipefail
|
||||
|
||||
tmpdir="$(mktemp -d)"
|
||||
trap 'rm -r "$tmpdir"' EXIT
|
||||
cd "$tmpdir"
|
||||
|
||||
curl -fLO --retry 3 "https://github.com/paradedb/paradedb/releases/download/v0.12.2/postgresql-14-pg-search_0.12.2-1PARADEDB-jammy_amd64.deb"
|
||||
|
||||
dpkg -i "postgresql-14-pg-search_0.12.2-1PARADEDB-jammy_amd64.deb"
|
|
@ -230,6 +230,7 @@ def install_system_deps() -> None:
|
|||
run_as_root(["./scripts/lib/build-groonga"])
|
||||
if BUILD_PGROONGA_FROM_SOURCE:
|
||||
run_as_root(["./scripts/lib/build-pgroonga"])
|
||||
run_as_root(["./scripts/lib/build-paradedb"])
|
||||
|
||||
|
||||
def install_apt_deps(deps_to_install: list[str]) -> None:
|
||||
|
|
|
@ -12,6 +12,7 @@ FILES_WITH_LEGACY_SUBJECT = {
|
|||
# This basically requires a big DB migration:
|
||||
"zerver/lib/topic.py",
|
||||
"zerver/lib/topic_sqlalchemy.py",
|
||||
"zerver/lib/paradedb.py",
|
||||
# This is for backward compatibility.
|
||||
"zerver/tests/test_legacy_subject.py",
|
||||
# Other migration-related changes require extreme care.
|
||||
|
|
|
@ -106,6 +106,10 @@ psql -v ON_ERROR_STOP=1 -e -h localhost "$DBNAME_BASE" "$USERNAME" <<EOF
|
|||
CREATE SCHEMA zulip;
|
||||
EOF
|
||||
|
||||
PGVERSION=$(psql --version | awk '{split($3, a, "."); print a[1]}')
|
||||
echo "shared_preload_libraries = 'pg_search'" | sudo tee "/etc/postgresql/$PGVERSION/main/conf.d/zulip-dev.conf"
|
||||
sudo systemctl restart "postgresql@${PGVERSION}-main"
|
||||
|
||||
"${ROOT_POSTGRES[@]}" -v ON_ERROR_STOP=1 -e "$DBNAME_BASE" <<EOF
|
||||
CREATE EXTENSION pgroonga;
|
||||
|
||||
|
@ -116,6 +120,8 @@ SELECT unnest(
|
|||
ARRAY['ALTER SYSTEM SET pgroonga.enable_wal = ''on''', 'SELECT pg_reload_conf()']
|
||||
END
|
||||
) \gexec
|
||||
|
||||
CREATE EXTENSION pg_search;
|
||||
EOF
|
||||
|
||||
psql -v ON_ERROR_STOP=1 -e -h localhost postgres "$USERNAME" <<EOF
|
||||
|
|
|
@ -40,6 +40,7 @@ from zerver.lib.message import (
|
|||
get_first_visible_message_id,
|
||||
)
|
||||
from zerver.lib.narrow_predicate import channel_operators, channels_operators
|
||||
from zerver.lib.paradedb import search_operand_to_tantivy_query
|
||||
from zerver.lib.recipient_users import recipient_for_user_profiles
|
||||
from zerver.lib.sqlalchemy_utils import get_sqlalchemy_connection
|
||||
from zerver.lib.streams import (
|
||||
|
@ -757,11 +758,48 @@ class NarrowBuilder:
|
|||
return query.where(maybe_negate(cond))
|
||||
|
||||
def by_search(self, query: Select, operand: str, maybe_negate: ConditionTransform) -> Select:
|
||||
if settings.USING_PGROONGA:
|
||||
if settings.USING_PARADEDB:
|
||||
return self._by_search_paradedb(query, operand, maybe_negate)
|
||||
elif settings.USING_PGROONGA:
|
||||
return self._by_search_pgroonga(query, operand, maybe_negate)
|
||||
else:
|
||||
return self._by_search_tsearch(query, operand, maybe_negate)
|
||||
|
||||
def _by_search_paradedb(
|
||||
self, query: Select, operand: str, maybe_negate: ConditionTransform
|
||||
) -> Select:
|
||||
# See search_operand_to_tantivy_query for details of what queries are supported.
|
||||
text_search_query = search_operand_to_tantivy_query(operand)
|
||||
|
||||
# This makes the tantivy query be passed to Postgres as a bind parameter.
|
||||
# This might interfere in the future with ParadeDB's ability to "push down"
|
||||
# regular SQL conditions such as "AND realm_id = N" into the tantivy query
|
||||
# when they add this smart push down functionality.
|
||||
# All this means for us is that we'll have to verify if it works before
|
||||
# relying on it, and if it doesn't just make sure we insert the relevant
|
||||
# conditions into the tantivy query ourselves when appropriate.
|
||||
#
|
||||
# The benefit of passing the tantivy query in a bind parameter is that
|
||||
# it's more robust against SQL injection if we have bugs in
|
||||
# search_operand_to_tantivy_query.
|
||||
safe_text_search_query = literal(text_search_query)
|
||||
final_search_operand = func.paradedb.parse(safe_text_search_query)
|
||||
|
||||
# TODO: finding the matches for higlighting is not implemented yet
|
||||
# https://github.com/paradedb/paradedb/issues/1778
|
||||
query = query.add_columns(
|
||||
literal_column("ARRAY[]::integer[]").label("content_matches"),
|
||||
literal_column("ARRAY[]::integer[]").label("topic_matches"),
|
||||
)
|
||||
|
||||
if text_search_query is None:
|
||||
# The operand was not parsable or not supported. Return empty results
|
||||
# from the search.
|
||||
return query.where(false())
|
||||
|
||||
condition = literal_column("zerver_message.id").op("@@@")(final_search_operand)
|
||||
return query.where(maybe_negate(condition))
|
||||
|
||||
def _by_search_pgroonga(
|
||||
self, query: Select, operand: str, maybe_negate: ConditionTransform
|
||||
) -> Select:
|
||||
|
@ -1438,6 +1476,9 @@ def fetch_messages(
|
|||
|
||||
# This is a hack to tag the query we use for testing
|
||||
query = query.prefix_with("/* get_messages */")
|
||||
# print("----------- final query -----------")
|
||||
# print(query.compile(compile_kwargs={"literal_binds": True}))
|
||||
# print("-----------------------------------")
|
||||
rows = list(sa_conn.execute(query).fetchall())
|
||||
|
||||
if client_requested_message_ids is not None:
|
||||
|
|
|
@ -0,0 +1,131 @@
|
|||
import html
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
|
||||
class QueryAtom:
|
||||
original_string: str
|
||||
html_escaped_string: str
|
||||
is_phrase: bool
|
||||
|
||||
|
||||
def search_operand_to_tantivy_query(operand: str) -> str | None:
|
||||
# TODO: Implement better parsing here so that we can support
|
||||
# more complex queries; consider exposing some of the tantivy
|
||||
# query language.
|
||||
#
|
||||
# A basic case that should at least be supported is search with phrase
|
||||
# terms and word terms such as
|
||||
# word1 "phrase with spaces" word2 "another phrase"
|
||||
# For now this only supports:
|
||||
# 1) single phrase search: "phrase with spaces"
|
||||
# 2) words search: word1 word2 word3
|
||||
|
||||
# https://docs.paradedb.com/documentation/full-text/term#special-characters
|
||||
special_chars = {
|
||||
"'",
|
||||
'"',
|
||||
"+",
|
||||
"^",
|
||||
"`",
|
||||
":",
|
||||
"{",
|
||||
"}",
|
||||
"[",
|
||||
"]",
|
||||
"(",
|
||||
")",
|
||||
"~",
|
||||
"!",
|
||||
"\\",
|
||||
"*",
|
||||
}
|
||||
# These aren't listed, but empirically seem to be clearly special symbols that need escaping
|
||||
# to be treated literally.
|
||||
# https://github.com/paradedb/paradedb/pull/1827
|
||||
special_chars.update({"<", ">"})
|
||||
|
||||
# The operand may have the form "<some content>", not necessarily alphanumeric.
|
||||
# That's treated as a phrase search.
|
||||
match = re.match(r'^"(.*)"$', operand)
|
||||
if match:
|
||||
extracted = match.group(1) # Return the content inside the quotes
|
||||
if not extracted:
|
||||
return None
|
||||
if '"' in extracted:
|
||||
# It doesn't seem important to support this in phrase search
|
||||
# as it might complicate the parsing logic in terms of figuring out the
|
||||
# boundaries of the phrase.
|
||||
return None
|
||||
|
||||
# Escape HTML as HTML symbols that were typed by the user in a message
|
||||
# end up HTML-escaped in rendered_content. " and ' are preserved literally
|
||||
# in rendered_content, so here we also need to be careful and html.escape(..., quote=False)
|
||||
# to preserve them.
|
||||
#
|
||||
# Note: Until the TODO below is resolved however, these special characters
|
||||
# are not allowed in the search anyway.
|
||||
html_escaped = html.escape(extracted, quote=False)
|
||||
|
||||
# TODO: The exact set of characters that may need escaping and how exactly
|
||||
# to escape them is still somewhat unclear. For now, just don't allow any
|
||||
# special characters. Add proper escaping when possible.
|
||||
if special_chars.intersection(html_escaped):
|
||||
# TODO: Alternatively, we could just remove the chars. Counterpoint: phrase
|
||||
# search is supposed to be very literal, so just clearly refusing to accept
|
||||
# the input might be better than altering the search.
|
||||
return None
|
||||
|
||||
query_atoms = [
|
||||
QueryAtom(original_string=extracted, html_escaped_string=html_escaped, is_phrase=True)
|
||||
]
|
||||
else:
|
||||
# The operand has the form
|
||||
# word1 word2 word3
|
||||
# where each word is a single word. Zulip's interpretation of this
|
||||
# search is "message contains all of these words in concat(subject, rendered_content)".
|
||||
#
|
||||
# The tantivy query reflecting that can be formed as:
|
||||
# (rendered_content:word1 OR subject:word1) AND (rendered_content:word2 OR subject:word2)
|
||||
# AND (rendered_content:word3 OR subject:word3)
|
||||
query_atoms = []
|
||||
|
||||
# Ignore special characters by stripping them out of the words.
|
||||
# TODO: Consider just escaping them once we fully understand how to do that
|
||||
# correctly and for which characters exactly.
|
||||
for word in operand.split():
|
||||
word = word.strip()
|
||||
word = "".join(char for char in word if char not in special_chars)
|
||||
html_escaped_word = html.escape(word, quote=False)
|
||||
query_atoms.append(
|
||||
QueryAtom(
|
||||
original_string=word, html_escaped_string=html_escaped_word, is_phrase=False
|
||||
)
|
||||
)
|
||||
|
||||
or_conditions: list[str] = []
|
||||
for query_atom in query_atoms:
|
||||
# In Zulip, .rendered_content contains HTML-escaped content from the user, while .subject doesn't
|
||||
# escape HTML and thus contains the original content as entered by the user.
|
||||
# For that reason, QueryAtom contains both HTML escaped and original versions of the search query.
|
||||
# We want to match rendered_content against the HTML escaped version, but subject against the original.
|
||||
#
|
||||
# E.g. if we are being queried for <b>hello</b>, we want to search for <b>hello</b> in
|
||||
# rendered_content, but <b>hello</b> in subject. In practice, punctuation seems mostly ignored in our
|
||||
# tantivy queries on the index with English stemming, so this is a fairly theoretical technicality;
|
||||
# but might matter if we change/add tokenizers in the future.
|
||||
if query_atom.is_phrase:
|
||||
assert (
|
||||
len(query_atoms) == 1
|
||||
), "Phrase search doesn't support anything else than a single phrase in the query"
|
||||
or_conditions.append(
|
||||
f'(rendered_content:"{query_atom.html_escaped_string}" OR subject:"{query_atom.original_string}")'
|
||||
)
|
||||
else:
|
||||
or_conditions.append(
|
||||
f"(rendered_content:{query_atom.html_escaped_string} OR subject:{query_atom.original_string})"
|
||||
)
|
||||
|
||||
final_query = " AND ".join(or_conditions)
|
||||
return final_query
|
|
@ -37,6 +37,8 @@ def get_sqlalchemy_connection() -> Iterator[Connection]:
|
|||
creator=get_dj_conn,
|
||||
poolclass=NonClosingPool,
|
||||
pool_reset_on_return=None,
|
||||
# TODO: This is for dev debugging, remove before merging.
|
||||
echo=True,
|
||||
)
|
||||
with sqlalchemy_engine.connect().execution_options(autocommit=False) as sa_connection:
|
||||
yield sa_connection
|
||||
|
|
|
@ -68,6 +68,7 @@ from .configured_settings import (
|
|||
STATIC_URL,
|
||||
SUBMIT_USAGE_STATISTICS,
|
||||
TORNADO_PORTS,
|
||||
USING_PARADEDB,
|
||||
USING_PGROONGA,
|
||||
ZULIP_ADMINISTRATOR,
|
||||
ZULIP_SERVICE_PUSH_NOTIFICATIONS,
|
||||
|
@ -276,7 +277,11 @@ INSTALLED_APPS = [
|
|||
"two_factor",
|
||||
"two_factor.plugins.phonenumber",
|
||||
]
|
||||
if USING_PARADEDB:
|
||||
assert not USING_PGROONGA
|
||||
INSTALLED_APPS += ["paradedb"]
|
||||
if USING_PGROONGA:
|
||||
assert not USING_PARADEDB
|
||||
INSTALLED_APPS += ["pgroonga"]
|
||||
INSTALLED_APPS += EXTRA_INSTALLED_APPS
|
||||
|
||||
|
|
|
@ -424,6 +424,10 @@ REALM_MOBILE_REMAP_URIS: dict[str, str] = {}
|
|||
# testing.
|
||||
USING_PGROONGA = False
|
||||
|
||||
# Whether the server is using the pg_search (paradedb) extension
|
||||
# for full-text search.
|
||||
USING_PARADEDB = False
|
||||
|
||||
# How Django should send emails. Set for most contexts in settings.py, but
|
||||
# available for sysadmin override in unusual cases.
|
||||
EMAIL_BACKEND: str | None = None
|
||||
|
|
|
@ -99,7 +99,8 @@ TERMS_OF_SERVICE_MESSAGE: str | None = "Description of changes to the ToS!"
|
|||
EMBEDDED_BOTS_ENABLED = True
|
||||
|
||||
SYSTEM_ONLY_REALMS: set[str] = set()
|
||||
USING_PGROONGA = True
|
||||
USING_PGROONGA = False
|
||||
USING_PARADEDB = True
|
||||
# Flush cache after migration.
|
||||
POST_MIGRATION_CACHE_FLUSHING = True
|
||||
|
||||
|
|
Loading…
Reference in New Issue