diff --git a/docs/full-text-search.md b/docs/full-text-search.md index eda806c1e3..fcd1ec5a40 100644 --- a/docs/full-text-search.md +++ b/docs/full-text-search.md @@ -34,6 +34,74 @@ application server instead. ## An optional full-text search implementation -See [the option PGroonga pull -request](https://github.com/zulip/zulip/pull/700) for details on the -status of the PGroonga integration. +Zulip now supports using [PGroonga](http://pgroonga.github.io/) for +full-text search. PGroonga is a PostgreSQL extension that provides +full-text search feature. PostgreSQL's built-in full-text search +feature supports only one language at a time (in Zulip's case, +English). PGroonga supports all languages simultaneously, including +Japanese, Chinese and so on, all at once. We expect to migrate +Zulip's full-text search to only support PGroonga once we have tested +this new extension fully. + +The following processes should be executed as the root user. Run: + + sudo -i + +### How to enable full-text search against all languages + +This section describes how to enable using PGroonga to back the +full-text search feature. + +* To install PGroonga, add `pgroonga = enabled` in the `[machine]` +section in `/etc/zulip/zulip.conf`: + + [machine] + ... + pgroonga = enabled + +And then run as root: + + /home/zulip/deployments/current/scripts/zulip-puppet-apply + +Then, add `USING_PGROONGA = true` in `/etc/zulip/settings.py`: + + USING_PGROONGA = True + +And apply the PGroonga migrations: + + cd /srv/zulip + ./manage.py migrate pgroonga + +Note that the migration may take a long time, and you can't send new +messages until the migration is finished. + +Once the migrations are complete, restart Zulip: + + su zulip -c /home/zulip/deployments/current/scripts/restart-server + +Now, you can use full-text search against all languages. + +### How to disable full-text search against all languages + +This section describes how to disable full-text search feature based +on PGroonga. + +If you want to fully remove PGroonga, first you need to remove the +PGroonga column (as above, this will take a long time and no messages +can be sent while it is running). If you intend to re-enable PGroonga +later, you can skip this step (at the cost of your Message table being +slightly larger than it would be otherwise). + + /home/zulip/deployments/current/manage.py migrate pgroonga zero + +Then, set `USING_PGROONGA = False` in `/etc/zulip/settings.py`: + + USING_PGROONGA = False + +And, restart Zulip: + + su zulip -c /home/zulip/deployments/current/scripts/restart-server + +Now, full-text search feature based on PGroonga is disabled. If you'd +like, you can also remove the `pgroonga = enabled` line in +`/etc/zulip/zulip.conf` and uninstall the `pgroonga` packages. diff --git a/pgroonga/__init__.py b/pgroonga/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/pgroonga/migrations/0001_enable.py b/pgroonga/migrations/0001_enable.py new file mode 100644 index 0000000000..cc008bec87 --- /dev/null +++ b/pgroonga/migrations/0001_enable.py @@ -0,0 +1,45 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +from django.db import models, migrations +from django.contrib.postgres import operations +from django.conf import settings + + +class Migration(migrations.Migration): + + dependencies = [ + ('zerver', '0001_initial'), + ] + + database_setting = settings.DATABASES["default"] + if "postgres" in database_setting["ENGINE"]: + operations = [ + migrations.RunSQL(""" +ALTER ROLE %(USER)s SET search_path TO %(SCHEMA)s,public,pgroonga,pg_catalog; + +SET search_path = %(SCHEMA)s,public,pgroonga,pg_catalog; + +ALTER TABLE zerver_message ADD COLUMN search_pgroonga text; + +UPDATE zerver_message SET search_pgroonga = subject || ' ' || rendered_content; + +-- TODO: We want to use CREATE INDEX CONCURRENTLY but it can't be used in +-- transaction. Django uses transaction implicitly. +-- Django 1.10 may solve the problem. +CREATE INDEX zerver_message_search_pgroonga ON zerver_message + USING pgroonga(search_pgroonga pgroonga.text_full_text_search_ops); +""" % database_setting, + """ +SET search_path = %(SCHEMA)s,public,pgroonga,pg_catalog; + +DROP INDEX zerver_message_search_pgroonga; +ALTER TABLE zerver_message DROP COLUMN search_pgroonga; + +SET search_path = %(SCHEMA)s,public; + +ALTER ROLE %(USER)s SET search_path TO %(SCHEMA)s,public; +""" % database_setting), + ] + else: + operations = [] diff --git a/pgroonga/migrations/__init__.py b/pgroonga/migrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/puppet/zulip/files/postgresql/process_fts_updates b/puppet/zulip/files/postgresql/process_fts_updates index dcacdd91ba..b9c4996743 100755 --- a/puppet/zulip/files/postgresql/process_fts_updates +++ b/puppet/zulip/files/postgresql/process_fts_updates @@ -34,6 +34,11 @@ def update_fts_columns(cursor): cursor.execute("SELECT id, message_id FROM fts_update_log;") ids = [] for (id, message_id) in cursor.fetchall(): + if settings.USING_PGROONGA: + cursor.execute("UPDATE zerver_message SET " + "search_pgroonga = " + "subject || ' ' || rendered_content " + "WHERE id = %s", (message_id,)) cursor.execute("UPDATE zerver_message SET " "search_tsvector = to_tsvector('zulip.english_us_search', " "subject || rendered_content) " diff --git a/puppet/zulip/manifests/postgres_appdb_base.pp b/puppet/zulip/manifests/postgres_appdb_base.pp index 0a63129f87..9ad259c33b 100644 --- a/puppet/zulip/manifests/postgres_appdb_base.pp +++ b/puppet/zulip/manifests/postgres_appdb_base.pp @@ -67,7 +67,7 @@ class zulip::postgres_appdb_base { } $pgroonga = zulipconf("machine", "pgroonga", "") - if $pgroonga != "" { + if $pgroonga == "enabled" { apt::ppa {'ppa:groonga/ppa': before => Package["postgresql-${zulip::base::postgres_version}-pgroonga"], } diff --git a/tools/setup/postgres-init-dev-db b/tools/setup/postgres-init-dev-db index e6eada14dd..3d47caa9e6 100755 --- a/tools/setup/postgres-init-dev-db +++ b/tools/setup/postgres-init-dev-db @@ -74,6 +74,8 @@ EOF $ROOT_POSTGRES "$DBNAME_BASE" << EOF CREATE EXTENSION tsearch_extras SCHEMA zulip; +CREATE EXTENSION pgroonga; +GRANT USAGE ON SCHEMA pgroonga TO $USERNAME; EOF psql -h localhost postgres "$USERNAME" <I am hungry!

') + @override_settings(USING_PGROONGA=True) + def test_get_old_messages_with_search_pgroonga(self): + self.login("cordelia@zulip.com") + + messages_to_search = [ + (u'日本語', u'こんにちは。今日はいい天気ですね。'), + (u'日本語', u'今朝はごはんを食べました。'), + (u'日本語', u'昨日、日本のお菓子を送りました。'), + ('english', u'I want to go to 日本!'), + ('english', 'Can you speak Japanese?'), + ] + + for topic, content in messages_to_search: + self.send_message( + sender_name="cordelia@zulip.com", + raw_recipients="Verona", + message_type=Recipient.STREAM, + content=content, + subject=topic, + ) + + # We use brute force here and update our text search index + # for the entire zerver_message table (which is small in test + # mode). In production there is an async process which keeps + # the search index up to date. + with connection.cursor() as cursor: + cursor.execute(""" + UPDATE zerver_message SET + search_pgroonga = subject || ' ' || rendered_content + """) + + narrow = [ + dict(operator='search', operand=u'日本'), + ] + result = self.get_and_check_messages(dict( + narrow=ujson.dumps(narrow), + anchor=0, + num_after=10, + )) + self.assertEqual(len(result['messages']), 4) + messages = result['messages'] + + japanese_message = [m for m in messages if m['subject'] == u'日本語'][-1] + self.assertEqual( + japanese_message['match_subject'], + u'日本語') + self.assertEqual( + japanese_message['match_content'], + u'

昨日、日本の' + + u'お菓子を送りました。

') + + english_message = [m for m in messages if m['subject'] == 'english'][0] + self.assertEqual( + english_message['match_subject'], + 'english') + self.assertEqual( + english_message['match_content'], + u'

I want to go to 日本!

') def test_get_old_messages_with_only_searching_anchor(self): """ @@ -949,6 +1023,7 @@ class GetOldMessagesTest(ZulipTestCase): 'narrow': '[["stream", "Scotland"], ["is", "starred"]]'}, sql) + @override_settings(USING_PGROONGA=False) def test_get_old_messages_with_search_queries(self): query_ids = self.get_query_ids() @@ -969,3 +1044,25 @@ class GetOldMessagesTest(ZulipTestCase): self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10, 'narrow': '[["search", "\\"jumping\\" quickly"]]'}, sql) + + @override_settings(USING_PGROONGA=True) + def test_get_old_messages_with_search_queries_pgroonga(self): + query_ids = self.get_query_ids() + + sql_template = u"SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.subject_matches \nFROM (SELECT message_id, flags, subject, rendered_content, pgroonga.match_positions_byte(rendered_content, pgroonga.query_extract_keywords('jumping')) AS content_matches, pgroonga.match_positions_byte(subject, pgroonga.query_extract_keywords('jumping')) AS subject_matches \nFROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \nWHERE user_profile_id = 2 AND (search_pgroonga @@ 'jumping') AND message_id >= 0 ORDER BY message_id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC" + sql = sql_template.format(**query_ids) + self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10, + 'narrow': '[["search", "jumping"]]'}, + sql) + + sql_template = "SELECT anon_1.message_id, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.subject_matches \nFROM (SELECT id AS message_id, subject, rendered_content, pgroonga.match_positions_byte(rendered_content, pgroonga.query_extract_keywords('jumping')) AS content_matches, pgroonga.match_positions_byte(subject, pgroonga.query_extract_keywords('jumping')) AS subject_matches \nFROM zerver_message \nWHERE recipient_id = 9 AND (search_pgroonga @@ 'jumping') AND zerver_message.id >= 0 ORDER BY zerver_message.id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC" + sql = sql_template.format(**query_ids) + self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10, + 'narrow': '[["stream", "Scotland"], ["search", "jumping"]]'}, + sql) + + sql_template = 'SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.subject_matches \nFROM (SELECT message_id, flags, subject, rendered_content, pgroonga.match_positions_byte(rendered_content, pgroonga.query_extract_keywords(\'"jumping" quickly\')) AS content_matches, pgroonga.match_positions_byte(subject, pgroonga.query_extract_keywords(\'"jumping" quickly\')) AS subject_matches \nFROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \nWHERE user_profile_id = 2 AND (search_pgroonga @@ \'"jumping" quickly\') AND message_id >= 0 ORDER BY message_id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC' + sql = sql_template.format(**query_ids) + self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10, + 'narrow': '[["search", "\\"jumping\\" quickly"]]'}, + sql) diff --git a/zerver/views/messages.py b/zerver/views/messages.py index d54e6da435..426dba5e94 100644 --- a/zerver/views/messages.py +++ b/zerver/views/messages.py @@ -10,7 +10,7 @@ from django.db.models import Q from django.http import HttpRequest, HttpResponse from six import text_type from typing import Any, AnyStr, Iterable, Optional, Tuple -from zerver.lib.str_utils import force_text +from zerver.lib.str_utils import force_bytes, force_text from zerver.decorator import authenticated_api_view, authenticated_json_post_view, \ has_request_variables, REQ, JsonableError, \ @@ -236,6 +236,23 @@ class NarrowBuilder(object): return query.where(maybe_negate(cond)) def by_search(self, query, operand, maybe_negate): + if settings.USING_PGROONGA: + return self._by_search_pgroonga(query, operand, maybe_negate) + else: + return self._by_search_tsearch(query, operand, maybe_negate) + + def _by_search_pgroonga(self, query, operand, maybe_negate): + match_positions_byte = func.pgroonga.match_positions_byte + query_extract_keywords = func.pgroonga.query_extract_keywords + keywords = query_extract_keywords(operand) + query = query.column(match_positions_byte(column("rendered_content"), + keywords).label("content_matches")) + query = query.column(match_positions_byte(column("subject"), + keywords).label("subject_matches")) + condition = column("search_pgroonga").op("@@")(operand) + return query.where(maybe_negate(condition)) + + def _by_search_tsearch(self, query, operand, maybe_negate): tsquery = func.plainto_tsquery(literal("zulip.english_us_search"), literal(operand)) ts_locs_array = func.ts_match_locs_array query = query.column(ts_locs_array(literal("zulip.english_us_search"), @@ -264,7 +281,7 @@ class NarrowBuilder(object): # Apparently, the offsets we get from tsearch_extras are counted in # unicode characters, not in bytes, so we do our processing with text, # not bytes. -def highlight_string(text, locs): +def highlight_string_text_offsets(text, locs): # type: (AnyStr, Iterable[Tuple[int, int]]) -> text_type string = force_text(text) highlight_start = u'' @@ -281,6 +298,30 @@ def highlight_string(text, locs): result += string[pos:] return result +def highlight_string_bytes_offsets(text, locs): + # type: (AnyStr, Iterable[Tuple[int, int]]) -> text_type + string = force_bytes(text) + highlight_start = b'' + highlight_stop = b'' + pos = 0 + result = b'' + for loc in locs: + (offset, length) = loc + result += string[pos:offset] + result += highlight_start + result += string[offset:offset + length] + result += highlight_stop + pos = offset + length + result += string[pos:] + return force_text(result) + +def highlight_string(text, locs): + # type: (AnyStr, Iterable[Tuple[int, int]]) -> text_type + if settings.USING_PGROONGA: + return highlight_string_bytes_offsets(text, locs) + else: + return highlight_string_text_offsets(text, locs) + def get_search_fields(rendered_content, subject, content_matches, subject_matches): # type: (text_type, text_type, Iterable[Tuple[int, int]], Iterable[Tuple[int, int]]) -> Dict[str, text_type] return dict(match_content=highlight_string(rendered_content, content_matches), diff --git a/zproject/dev_settings.py b/zproject/dev_settings.py index 1fafadacb3..91f02dccbb 100644 --- a/zproject/dev_settings.py +++ b/zproject/dev_settings.py @@ -25,3 +25,4 @@ TERMS_OF_SERVICE = 'zproject/terms.md.template' SAVE_FRONTEND_STACKTRACES = True EVENT_LOGS_ENABLED = True SYSTEM_ONLY_REALMS = set() # type: Set[str] +USING_PGROONGA = True diff --git a/zproject/settings.py b/zproject/settings.py index 57fa996aa6..a0afb1bf44 100644 --- a/zproject/settings.py +++ b/zproject/settings.py @@ -173,7 +173,8 @@ DEFAULT_SETTINGS = {'TWITTER_CONSUMER_KEY': '', 'TERMS_OF_SERVICE': None, 'TOS_VERSION': None, 'SYSTEM_ONLY_REALMS': {"zulip.com"}, - 'FIRST_TIME_TOS_TEMPLATE': None + 'FIRST_TIME_TOS_TEMPLATE': None, + 'USING_PGROONGA': False, } for setting_name, setting_val in six.iteritems(DEFAULT_SETTINGS): @@ -332,7 +333,10 @@ INSTALLED_APPS = [ 'guardian', 'pipeline', 'zerver', -] + EXTRA_INSTALLED_APPS +] +if USING_PGROONGA: + INSTALLED_APPS += ['pgroonga'] +INSTALLED_APPS += EXTRA_INSTALLED_APPS ZILENCER_ENABLED = 'zilencer' in INSTALLED_APPS @@ -379,6 +383,16 @@ elif REMOTE_POSTGRES_HOST != '': else: DATABASES['default']['OPTIONS']['sslmode'] = 'verify-full' +if USING_PGROONGA: + # We need to have "pgroonga" schema before "pg_catalog" schema in + # the PostgreSQL search path, because "pgroonga" schema overrides + # the "@@" operator from "pg_catalog" schema, and "pg_catalog" + # schema is searched first if not specified in the search path. + # See also: http://www.postgresql.org/docs/current/static/runtime-config-client.html + pg_options = '-c search_path=%(SCHEMA)s,zulip,public,pgroonga,pg_catalog' % \ + DATABASES['default'] + DATABASES['default']['OPTIONS']['options'] = pg_options + ######################################################################## # RABBITMQ CONFIGURATION ########################################################################