diff --git a/docs/full-text-search.md b/docs/full-text-search.md
index eda806c1e3..fcd1ec5a40 100644
--- a/docs/full-text-search.md
+++ b/docs/full-text-search.md
@@ -34,6 +34,74 @@ application server instead.
## An optional full-text search implementation
-See [the option PGroonga pull
-request](https://github.com/zulip/zulip/pull/700) for details on the
-status of the PGroonga integration.
+Zulip now supports using [PGroonga](http://pgroonga.github.io/) for
+full-text search. PGroonga is a PostgreSQL extension that provides a
+full-text search feature. PostgreSQL's built-in full-text search
+feature supports only one language at a time (in Zulip's case,
+English). PGroonga supports all languages simultaneously, including
+Japanese, Chinese, and so on. We expect to migrate Zulip's
+full-text search to support only PGroonga once we have tested
+this new extension fully.
+
+The following steps should be executed as the root user. To get a root shell, run:
+
+ sudo -i
+
+### How to enable full-text search against all languages
+
+This section describes how to enable using PGroonga to back the
+full-text search feature.
+
+* To install PGroonga, add `pgroonga = enabled` in the `[machine]`
+section in `/etc/zulip/zulip.conf`:
+
+ [machine]
+ ...
+ pgroonga = enabled
+
+And then run as root:
+
+ /home/zulip/deployments/current/scripts/zulip-puppet-apply
+
+Then, add `USING_PGROONGA = True` to `/etc/zulip/settings.py`:
+
+ USING_PGROONGA = True
+
+And apply the PGroonga migrations:
+
+ cd /home/zulip/deployments/current
+ ./manage.py migrate pgroonga
+
+Note that the migration may take a long time, and you can't send new
+messages until the migration is finished.
+
+Once the migrations are complete, restart Zulip:
+
+ su zulip -c /home/zulip/deployments/current/scripts/restart-server
+
+Now, you can use full-text search against all languages.
+
+### How to disable full-text search against all languages
+
+This section describes how to disable the full-text search feature based
+on PGroonga.
+
+If you want to fully remove PGroonga, first you need to remove the
+PGroonga column (as above, this will take a long time and no messages
+can be sent while it is running). If you intend to re-enable PGroonga
+later, you can skip this step (at the cost of your Message table being
+slightly larger than it would be otherwise).
+
+ /home/zulip/deployments/current/manage.py migrate pgroonga zero
+
+Then, set `USING_PGROONGA = False` in `/etc/zulip/settings.py`:
+
+ USING_PGROONGA = False
+
+And, restart Zulip:
+
+ su zulip -c /home/zulip/deployments/current/scripts/restart-server
+
+Now, the full-text search feature based on PGroonga is disabled. If you'd
+like, you can also remove the `pgroonga = enabled` line in
+`/etc/zulip/zulip.conf` and uninstall the `pgroonga` packages.
diff --git a/pgroonga/__init__.py b/pgroonga/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/pgroonga/migrations/0001_enable.py b/pgroonga/migrations/0001_enable.py
new file mode 100644
index 0000000000..cc008bec87
--- /dev/null
+++ b/pgroonga/migrations/0001_enable.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import models, migrations
+from django.contrib.postgres import operations
+from django.conf import settings
+
+
+class Migration(migrations.Migration):  # Adds (and, in reverse, removes) the search_pgroonga column and its PGroonga index on zerver_message.
+ # On database engines whose ENGINE name does not contain "postgres", this migration defines no operations.
+ dependencies = [  # Ordered after the initial zerver migration.
+ ('zerver', '0001_initial'),
+ ]
+
+ database_setting = settings.DATABASES["default"]  # Supplies the USER and SCHEMA values interpolated into the SQL below.
+ if "postgres" in database_setting["ENGINE"]:  # The SQL below is PostgreSQL/PGroonga-specific.
+ operations = [
+ migrations.RunSQL("""
ALTER ROLE %(USER)s SET search_path TO %(SCHEMA)s,public,pgroonga,pg_catalog;

SET search_path = %(SCHEMA)s,public,pgroonga,pg_catalog;

ALTER TABLE zerver_message ADD COLUMN search_pgroonga text;

UPDATE zerver_message SET search_pgroonga = subject || ' ' || rendered_content;

-- TODO: We want to use CREATE INDEX CONCURRENTLY but it can't be used in
-- transaction. Django uses transaction implicitly.
-- Django 1.10 may solve the problem.
CREATE INDEX zerver_message_search_pgroonga ON zerver_message
 USING pgroonga(search_pgroonga pgroonga.text_full_text_search_ops);
""" % database_setting,  # End of forward SQL: set search_path, add and populate search_pgroonga, create the index.
+ """
SET search_path = %(SCHEMA)s,public,pgroonga,pg_catalog;

DROP INDEX zerver_message_search_pgroonga;
ALTER TABLE zerver_message DROP COLUMN search_pgroonga;

SET search_path = %(SCHEMA)s,public;

ALTER ROLE %(USER)s SET search_path TO %(SCHEMA)s,public;
""" % database_setting),  # End of reverse SQL: drop the index and column, then restore the role's search_path.
+ ]
+ else:
+ operations = []  # Nothing to apply on other database engines.
diff --git a/pgroonga/migrations/__init__.py b/pgroonga/migrations/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/puppet/zulip/files/postgresql/process_fts_updates b/puppet/zulip/files/postgresql/process_fts_updates
index dcacdd91ba..b9c4996743 100755
--- a/puppet/zulip/files/postgresql/process_fts_updates
+++ b/puppet/zulip/files/postgresql/process_fts_updates
@@ -34,6 +34,11 @@ def update_fts_columns(cursor):
cursor.execute("SELECT id, message_id FROM fts_update_log;")
ids = []
for (id, message_id) in cursor.fetchall():
+ if settings.USING_PGROONGA:
+ cursor.execute("UPDATE zerver_message SET "
+ "search_pgroonga = "
+ "subject || ' ' || rendered_content "
+ "WHERE id = %s", (message_id,))
cursor.execute("UPDATE zerver_message SET "
"search_tsvector = to_tsvector('zulip.english_us_search', "
"subject || rendered_content) "
diff --git a/puppet/zulip/manifests/postgres_appdb_base.pp b/puppet/zulip/manifests/postgres_appdb_base.pp
index 0a63129f87..9ad259c33b 100644
--- a/puppet/zulip/manifests/postgres_appdb_base.pp
+++ b/puppet/zulip/manifests/postgres_appdb_base.pp
@@ -67,7 +67,7 @@ class zulip::postgres_appdb_base {
}
$pgroonga = zulipconf("machine", "pgroonga", "")
- if $pgroonga != "" {
+ if $pgroonga == "enabled" {
apt::ppa {'ppa:groonga/ppa':
before => Package["postgresql-${zulip::base::postgres_version}-pgroonga"],
}
diff --git a/tools/setup/postgres-init-dev-db b/tools/setup/postgres-init-dev-db
index e6eada14dd..3d47caa9e6 100755
--- a/tools/setup/postgres-init-dev-db
+++ b/tools/setup/postgres-init-dev-db
@@ -74,6 +74,8 @@ EOF
$ROOT_POSTGRES "$DBNAME_BASE" << EOF
CREATE EXTENSION tsearch_extras SCHEMA zulip;
+CREATE EXTENSION pgroonga;
+GRANT USAGE ON SCHEMA pgroonga TO $USERNAME;
EOF
psql -h localhost postgres "$USERNAME" <
昨日、日本の' + + u'お菓子を送りました。
') + + english_message = [m for m in messages if m['subject'] == 'english'][0] + self.assertEqual( + english_message['match_subject'], + 'english') + self.assertEqual( + english_message['match_content'], + u'I want to go to 日本!
') def test_get_old_messages_with_only_searching_anchor(self): """ @@ -949,6 +1023,7 @@ class GetOldMessagesTest(ZulipTestCase): 'narrow': '[["stream", "Scotland"], ["is", "starred"]]'}, sql) + @override_settings(USING_PGROONGA=False) def test_get_old_messages_with_search_queries(self): query_ids = self.get_query_ids() @@ -969,3 +1044,25 @@ class GetOldMessagesTest(ZulipTestCase): self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10, 'narrow': '[["search", "\\"jumping\\" quickly"]]'}, sql) + + @override_settings(USING_PGROONGA=True) + def test_get_old_messages_with_search_queries_pgroonga(self): + query_ids = self.get_query_ids() + + sql_template = u"SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.subject_matches \nFROM (SELECT message_id, flags, subject, rendered_content, pgroonga.match_positions_byte(rendered_content, pgroonga.query_extract_keywords('jumping')) AS content_matches, pgroonga.match_positions_byte(subject, pgroonga.query_extract_keywords('jumping')) AS subject_matches \nFROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \nWHERE user_profile_id = 2 AND (search_pgroonga @@ 'jumping') AND message_id >= 0 ORDER BY message_id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC" + sql = sql_template.format(**query_ids) + self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10, + 'narrow': '[["search", "jumping"]]'}, + sql) + + sql_template = "SELECT anon_1.message_id, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.subject_matches \nFROM (SELECT id AS message_id, subject, rendered_content, pgroonga.match_positions_byte(rendered_content, pgroonga.query_extract_keywords('jumping')) AS content_matches, pgroonga.match_positions_byte(subject, pgroonga.query_extract_keywords('jumping')) AS subject_matches \nFROM zerver_message \nWHERE recipient_id = 9 AND 
(search_pgroonga @@ 'jumping') AND zerver_message.id >= 0 ORDER BY zerver_message.id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC" + sql = sql_template.format(**query_ids) + self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10, + 'narrow': '[["stream", "Scotland"], ["search", "jumping"]]'}, + sql) + + sql_template = 'SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.subject_matches \nFROM (SELECT message_id, flags, subject, rendered_content, pgroonga.match_positions_byte(rendered_content, pgroonga.query_extract_keywords(\'"jumping" quickly\')) AS content_matches, pgroonga.match_positions_byte(subject, pgroonga.query_extract_keywords(\'"jumping" quickly\')) AS subject_matches \nFROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \nWHERE user_profile_id = 2 AND (search_pgroonga @@ \'"jumping" quickly\') AND message_id >= 0 ORDER BY message_id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC' + sql = sql_template.format(**query_ids) + self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10, + 'narrow': '[["search", "\\"jumping\\" quickly"]]'}, + sql) diff --git a/zerver/views/messages.py b/zerver/views/messages.py index d54e6da435..426dba5e94 100644 --- a/zerver/views/messages.py +++ b/zerver/views/messages.py @@ -10,7 +10,7 @@ from django.db.models import Q from django.http import HttpRequest, HttpResponse from six import text_type from typing import Any, AnyStr, Iterable, Optional, Tuple -from zerver.lib.str_utils import force_text +from zerver.lib.str_utils import force_bytes, force_text from zerver.decorator import authenticated_api_view, authenticated_json_post_view, \ has_request_variables, REQ, JsonableError, \ @@ -236,6 +236,23 @@ class NarrowBuilder(object): return query.where(maybe_negate(cond)) def by_search(self, query, operand, maybe_negate): + if settings.USING_PGROONGA: + 
return self._by_search_pgroonga(query, operand, maybe_negate) + else: + return self._by_search_tsearch(query, operand, maybe_negate) + + def _by_search_pgroonga(self, query, operand, maybe_negate): + match_positions_byte = func.pgroonga.match_positions_byte + query_extract_keywords = func.pgroonga.query_extract_keywords + keywords = query_extract_keywords(operand) + query = query.column(match_positions_byte(column("rendered_content"), + keywords).label("content_matches")) + query = query.column(match_positions_byte(column("subject"), + keywords).label("subject_matches")) + condition = column("search_pgroonga").op("@@")(operand) + return query.where(maybe_negate(condition)) + + def _by_search_tsearch(self, query, operand, maybe_negate): tsquery = func.plainto_tsquery(literal("zulip.english_us_search"), literal(operand)) ts_locs_array = func.ts_match_locs_array query = query.column(ts_locs_array(literal("zulip.english_us_search"), @@ -264,7 +281,7 @@ class NarrowBuilder(object): # Apparently, the offsets we get from tsearch_extras are counted in # unicode characters, not in bytes, so we do our processing with text, # not bytes. 
-def highlight_string(text, locs): +def highlight_string_text_offsets(text, locs): # type: (AnyStr, Iterable[Tuple[int, int]]) -> text_type string = force_text(text) highlight_start = u'' @@ -281,6 +298,30 @@ def highlight_string(text, locs): result += string[pos:] return result +def highlight_string_bytes_offsets(text, locs): + # type: (AnyStr, Iterable[Tuple[int, int]]) -> text_type + string = force_bytes(text) + highlight_start = b'' + highlight_stop = b'' + pos = 0 + result = b'' + for loc in locs: + (offset, length) = loc + result += string[pos:offset] + result += highlight_start + result += string[offset:offset + length] + result += highlight_stop + pos = offset + length + result += string[pos:] + return force_text(result) + +def highlight_string(text, locs): + # type: (AnyStr, Iterable[Tuple[int, int]]) -> text_type + if settings.USING_PGROONGA: + return highlight_string_bytes_offsets(text, locs) + else: + return highlight_string_text_offsets(text, locs) + def get_search_fields(rendered_content, subject, content_matches, subject_matches): # type: (text_type, text_type, Iterable[Tuple[int, int]], Iterable[Tuple[int, int]]) -> Dict[str, text_type] return dict(match_content=highlight_string(rendered_content, content_matches), diff --git a/zproject/dev_settings.py b/zproject/dev_settings.py index 1fafadacb3..91f02dccbb 100644 --- a/zproject/dev_settings.py +++ b/zproject/dev_settings.py @@ -25,3 +25,4 @@ TERMS_OF_SERVICE = 'zproject/terms.md.template' SAVE_FRONTEND_STACKTRACES = True EVENT_LOGS_ENABLED = True SYSTEM_ONLY_REALMS = set() # type: Set[str] +USING_PGROONGA = True diff --git a/zproject/settings.py b/zproject/settings.py index 57fa996aa6..a0afb1bf44 100644 --- a/zproject/settings.py +++ b/zproject/settings.py @@ -173,7 +173,8 @@ DEFAULT_SETTINGS = {'TWITTER_CONSUMER_KEY': '', 'TERMS_OF_SERVICE': None, 'TOS_VERSION': None, 'SYSTEM_ONLY_REALMS': {"zulip.com"}, - 'FIRST_TIME_TOS_TEMPLATE': None + 'FIRST_TIME_TOS_TEMPLATE': None, + 'USING_PGROONGA': False, 
} for setting_name, setting_val in six.iteritems(DEFAULT_SETTINGS): @@ -332,7 +333,10 @@ INSTALLED_APPS = [ 'guardian', 'pipeline', 'zerver', -] + EXTRA_INSTALLED_APPS +] +if USING_PGROONGA: + INSTALLED_APPS += ['pgroonga'] +INSTALLED_APPS += EXTRA_INSTALLED_APPS ZILENCER_ENABLED = 'zilencer' in INSTALLED_APPS @@ -379,6 +383,16 @@ elif REMOTE_POSTGRES_HOST != '': else: DATABASES['default']['OPTIONS']['sslmode'] = 'verify-full' +if USING_PGROONGA: + # We need to have "pgroonga" schema before "pg_catalog" schema in + # the PostgreSQL search path, because "pgroonga" schema overrides + # the "@@" operator from "pg_catalog" schema, and "pg_catalog" + # schema is searched first if not specified in the search path. + # See also: http://www.postgresql.org/docs/current/static/runtime-config-client.html + pg_options = '-c search_path=%(SCHEMA)s,zulip,public,pgroonga,pg_catalog' % \ + DATABASES['default'] + DATABASES['default']['OPTIONS']['options'] = pg_options + ######################################################################## # RABBITMQ CONFIGURATION ########################################################################