From 683f49aa99ab40c457b94d7009890eea12b93c96 Mon Sep 17 00:00:00 2001
From: Kouhei Sutou <kou@clear-code.com>
Date: Mon, 25 Apr 2016 00:08:51 +0900
Subject: [PATCH] Support full text search for all languages using pgroonga.

This adds support for using PGroonga to back the Zulip full-text
search feature.  Because built-in PostgreSQL full text search doesn't
support languages that don't put space between terms such as Japanese,
Chinese and so on. PGroonga supports all languages including Japanese
and Chinese.

Developers will need to re-provision when rebasing past this patch for
the tests to pass, since provision is what installs the PGroonga
package and extension.

PGroonga is enabled by default in development but not in production;
the hope is that after the PGroonga support is tested further, we can
enable it by default.

Fixes #615.

[docs and tests tweaked by tabbott]
---
 docs/full-text-search.md                      |  74 +++++++++++-
 pgroonga/__init__.py                          |   0
 pgroonga/migrations/0001_enable.py            |  45 ++++++++
 pgroonga/migrations/__init__.py               |   0
 .../files/postgresql/process_fts_updates      |   5 +
 puppet/zulip/manifests/postgres_appdb_base.pp |   2 +-
 tools/setup/postgres-init-dev-db              |   2 +
 zerver/tests/test_narrow.py                   | 105 +++++++++++++++++-
 zerver/views/messages.py                      |  45 +++++++-
 zproject/dev_settings.py                      |   1 +
 zproject/settings.py                          |  18 ++-
 11 files changed, 285 insertions(+), 12 deletions(-)
 create mode 100644 pgroonga/__init__.py
 create mode 100644 pgroonga/migrations/0001_enable.py
 create mode 100644 pgroonga/migrations/__init__.py

diff --git a/docs/full-text-search.md b/docs/full-text-search.md
index eda806c1e3..fcd1ec5a40 100644
--- a/docs/full-text-search.md
+++ b/docs/full-text-search.md
@@ -34,6 +34,74 @@ application server instead.
 
 ## An optional full-text search implementation
 
-See [the option PGroonga pull
-request](https://github.com/zulip/zulip/pull/700) for details on the
-status of the PGroonga integration.
+Zulip now supports using [PGroonga](http://pgroonga.github.io/) for
+full-text search. PGroonga is a PostgreSQL extension that provides
+full-text search feature. PostgreSQL's built-in full-text search
+feature supports only one language at a time (in Zulip's case,
+English).  PGroonga supports all languages simultaneously, including
+Japanese, Chinese and so on, all at once.  We expect to migrate
+Zulip's full-text search to only support PGroonga once we have tested
+this new extension fully.
+
+The following processes should be executed as the root user. Run:
+
+    sudo -i
+
+### How to enable full-text search against all languages
+
+This section describes how to enable using PGroonga to back the
+full-text search feature.
+
+* To install PGroonga, add `pgroonga = enabled` in the `[machine]`
+section in `/etc/zulip/zulip.conf`:
+
+    [machine]
+    ...
+    pgroonga = enabled
+
+And then run as root:
+
+    /home/zulip/deployments/current/scripts/zulip-puppet-apply
+
+Then, add `USING_PGROONGA = true` in `/etc/zulip/settings.py`:
+
+    USING_PGROONGA = True
+
+And apply the PGroonga migrations:
+
+    cd /srv/zulip
+    ./manage.py migrate pgroonga
+
+Note that the migration may take a long time, and you can't send new
+messages until the migration is finished.
+
+Once the migrations are complete, restart Zulip:
+
+    su zulip -c /home/zulip/deployments/current/scripts/restart-server
+
+Now, you can use full-text search against all languages.
+
+### How to disable full-text search against all languages
+
+This section describes how to disable full-text search feature based
+on PGroonga.
+
+If you want to fully remove PGroonga, first you need to remove the
+PGroonga column (as above, this will take a long time and no messages
+can be sent while it is running).  If you intend to re-enable PGroonga
+later, you can skip this step (at the cost of your Message table being
+slightly larger than it would be otherwise).
+
+    /home/zulip/deployments/current/manage.py migrate pgroonga zero
+
+Then, set `USING_PGROONGA = False` in `/etc/zulip/settings.py`:
+
+    USING_PGROONGA = False
+
+And, restart Zulip:
+
+    su zulip -c /home/zulip/deployments/current/scripts/restart-server
+
+Now, full-text search feature based on PGroonga is disabled.  If you'd
+like, you can also remove the `pgroonga = enabled` line in
+`/etc/zulip/zulip.conf` and uninstall the `pgroonga` packages.
diff --git a/pgroonga/__init__.py b/pgroonga/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/pgroonga/migrations/0001_enable.py b/pgroonga/migrations/0001_enable.py
new file mode 100644
index 0000000000..cc008bec87
--- /dev/null
+++ b/pgroonga/migrations/0001_enable.py
@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
+
+from django.db import models, migrations
+from django.contrib.postgres import operations
+from django.conf import settings
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('zerver', '0001_initial'),
+    ]
+
+    database_setting = settings.DATABASES["default"]
+    if "postgres" in database_setting["ENGINE"]:
+        operations = [
+            migrations.RunSQL("""
+ALTER ROLE %(USER)s SET search_path TO %(SCHEMA)s,public,pgroonga,pg_catalog;
+
+SET search_path = %(SCHEMA)s,public,pgroonga,pg_catalog;
+
+ALTER TABLE zerver_message ADD COLUMN search_pgroonga text;
+
+UPDATE zerver_message SET search_pgroonga = subject || ' ' || rendered_content;
+
+-- TODO: We want to use CREATE INDEX CONCURRENTLY but it can't be used in
+-- transaction. Django uses transaction implicitly.
+-- Django 1.10 may solve the problem.
+CREATE INDEX zerver_message_search_pgroonga ON zerver_message
+  USING pgroonga(search_pgroonga pgroonga.text_full_text_search_ops);
+""" % database_setting,
+                              """
+SET search_path = %(SCHEMA)s,public,pgroonga,pg_catalog;
+
+DROP INDEX zerver_message_search_pgroonga;
+ALTER TABLE zerver_message DROP COLUMN search_pgroonga;
+
+SET search_path = %(SCHEMA)s,public;
+
+ALTER ROLE %(USER)s SET search_path TO %(SCHEMA)s,public;
+""" % database_setting),
+        ]
+    else:
+        operations = []
diff --git a/pgroonga/migrations/__init__.py b/pgroonga/migrations/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/puppet/zulip/files/postgresql/process_fts_updates b/puppet/zulip/files/postgresql/process_fts_updates
index dcacdd91ba..b9c4996743 100755
--- a/puppet/zulip/files/postgresql/process_fts_updates
+++ b/puppet/zulip/files/postgresql/process_fts_updates
@@ -34,6 +34,11 @@ def update_fts_columns(cursor):
     cursor.execute("SELECT id, message_id FROM fts_update_log;")
     ids = []
     for (id, message_id) in cursor.fetchall():
+        if settings.USING_PGROONGA:
+            cursor.execute("UPDATE zerver_message SET "
+                           "search_pgroonga = "
+                           "subject || ' ' || rendered_content "
+                           "WHERE id = %s", (message_id,))
         cursor.execute("UPDATE zerver_message SET "
                        "search_tsvector = to_tsvector('zulip.english_us_search', "
                        "subject || rendered_content) "
diff --git a/puppet/zulip/manifests/postgres_appdb_base.pp b/puppet/zulip/manifests/postgres_appdb_base.pp
index 0a63129f87..9ad259c33b 100644
--- a/puppet/zulip/manifests/postgres_appdb_base.pp
+++ b/puppet/zulip/manifests/postgres_appdb_base.pp
@@ -67,7 +67,7 @@ class zulip::postgres_appdb_base {
   }
 
   $pgroonga = zulipconf("machine", "pgroonga", "")
-  if $pgroonga != "" {
+  if $pgroonga == "enabled" {
     apt::ppa {'ppa:groonga/ppa':
       before => Package["postgresql-${zulip::base::postgres_version}-pgroonga"],
     }
diff --git a/tools/setup/postgres-init-dev-db b/tools/setup/postgres-init-dev-db
index e6eada14dd..3d47caa9e6 100755
--- a/tools/setup/postgres-init-dev-db
+++ b/tools/setup/postgres-init-dev-db
@@ -74,6 +74,8 @@ EOF
 
 $ROOT_POSTGRES "$DBNAME_BASE" << EOF
 CREATE EXTENSION tsearch_extras SCHEMA zulip;
+CREATE EXTENSION pgroonga;
+GRANT USAGE ON SCHEMA pgroonga TO $USERNAME;
 EOF
 
 psql -h localhost postgres "$USERNAME" <<EOF
diff --git a/zerver/tests/test_narrow.py b/zerver/tests/test_narrow.py
index 1d4f141250..7b3a7da852 100644
--- a/zerver/tests/test_narrow.py
+++ b/zerver/tests/test_narrow.py
@@ -1,6 +1,9 @@
+# -*- coding: utf-8 -*-
+
 from __future__ import absolute_import
 from __future__ import print_function
 from django.db import connection
+from django.test import override_settings
 from sqlalchemy.sql import (
     and_, select, column, compiler
 )
@@ -163,14 +166,26 @@ class NarrowBuilderTest(ZulipTestCase):
         term = dict(operator='id', operand=555, negated=True)
         self._do_add_term_test(term, 'WHERE id != :param_1')
 
+    @override_settings(USING_PGROONGA=False)
     def test_add_term_using_search_operator(self):
         term = dict(operator='search', operand='"french fries"')
         self._do_add_term_test(term, 'WHERE (lower(content) LIKE lower(:content_1) OR lower(subject) LIKE lower(:subject_1)) AND (search_tsvector @@ plainto_tsquery(:param_2, :param_3))')
 
+    @override_settings(USING_PGROONGA=False)
     def test_add_term_using_search_operator_and_negated(self):  # NEGATED
         term = dict(operator='search', operand='"french fries"', negated=True)
         self._do_add_term_test(term, 'WHERE NOT (lower(content) LIKE lower(:content_1) OR lower(subject) LIKE lower(:subject_1)) AND NOT (search_tsvector @@ plainto_tsquery(:param_2, :param_3))')
 
+    @override_settings(USING_PGROONGA=True)
+    def test_add_term_using_search_operator_pgroonga(self):
+        term = dict(operator='search', operand='"french fries"')
+        self._do_add_term_test(term, 'WHERE search_pgroonga @@ :search_pgroonga_1')
+
+    @override_settings(USING_PGROONGA=True)
+    def test_add_term_using_search_operator_and_negated_pgroonga(self):  # NEGATED
+        term = dict(operator='search', operand='"french fries"', negated=True)
+        self._do_add_term_test(term, 'WHERE NOT (search_pgroonga @@ :search_pgroonga_1)')
+
     def test_add_term_using_has_operator_and_attachment_operand(self):
         term = dict(operator='has', operand='attachment')
         self._do_add_term_test(term, 'WHERE has_attachment')
@@ -487,6 +502,7 @@ class GetOldMessagesTest(ZulipTestCase):
         for message in result["messages"]:
             self.assertEqual(message["sender_email"], "othello@zulip.com")
 
+    @override_settings(USING_PGROONGA=False)
     def test_get_old_messages_with_search(self):
         self.login("cordelia@zulip.com")
 
@@ -513,10 +529,10 @@ class GetOldMessagesTest(ZulipTestCase):
         # the search index up to date.
         with connection.cursor() as cursor:
             cursor.execute("""
-                UPDATE zerver_message SET
-                search_tsvector = to_tsvector('zulip.english_us_search',
-                subject || rendered_content)
-                """)
+            UPDATE zerver_message SET
+            search_tsvector = to_tsvector('zulip.english_us_search',
+            subject || rendered_content)
+            """)
 
         narrow = [
             dict(operator='sender', operand='cordelia@zulip.com'),
@@ -547,6 +563,64 @@ class GetOldMessagesTest(ZulipTestCase):
             meeting_message['match_content'],
             '<p>I am hungry!</p>')
 
+    @override_settings(USING_PGROONGA=True)
+    def test_get_old_messages_with_search_pgroonga(self):
+        self.login("cordelia@zulip.com")
+
+        messages_to_search = [
+            (u'日本語', u'こんにちは。今日はいい天気ですね。'),
+            (u'日本語', u'今朝はごはんを食べました。'),
+            (u'日本語', u'昨日、日本のお菓子を送りました。'),
+            ('english', u'I want to go to 日本!'),
+            ('english', 'Can you speak Japanese?'),
+        ]
+
+        for topic, content in messages_to_search:
+            self.send_message(
+                sender_name="cordelia@zulip.com",
+                raw_recipients="Verona",
+                message_type=Recipient.STREAM,
+                content=content,
+                subject=topic,
+            )
+
+        # We use brute force here and update our text search index
+        # for the entire zerver_message table (which is small in test
+        # mode).  In production there is an async process which keeps
+        # the search index up to date.
+        with connection.cursor() as cursor:
+            cursor.execute("""
+                UPDATE zerver_message SET
+                search_pgroonga = subject || ' ' || rendered_content
+                """)
+
+        narrow = [
+            dict(operator='search', operand=u'日本'),
+        ]
+        result = self.get_and_check_messages(dict(
+            narrow=ujson.dumps(narrow),
+            anchor=0,
+            num_after=10,
+        ))
+        self.assertEqual(len(result['messages']), 4)
+        messages = result['messages']
+
+        japanese_message = [m for m in messages if m['subject'] == u'日本語'][-1]
+        self.assertEqual(
+            japanese_message['match_subject'],
+            u'<span class="highlight">日本</span>語')
+        self.assertEqual(
+            japanese_message['match_content'],
+            u'<p>昨日、<span class="highlight">日本</span>の' +
+            u'お菓子を送りました。</p>')
+
+        english_message = [m for m in messages if m['subject'] == 'english'][0]
+        self.assertEqual(
+            english_message['match_subject'],
+            'english')
+        self.assertEqual(
+            english_message['match_content'],
+            u'<p>I want to go to <span class="highlight">日本</span>!</p>')
 
     def test_get_old_messages_with_only_searching_anchor(self):
         """
@@ -949,6 +1023,7 @@ class GetOldMessagesTest(ZulipTestCase):
                                                   'narrow': '[["stream", "Scotland"], ["is", "starred"]]'},
                                                  sql)
 
+    @override_settings(USING_PGROONGA=False)
     def test_get_old_messages_with_search_queries(self):
         query_ids = self.get_query_ids()
 
@@ -969,3 +1044,25 @@ class GetOldMessagesTest(ZulipTestCase):
         self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10,
                                                   'narrow': '[["search", "\\"jumping\\" quickly"]]'},
                                                  sql)
+
+    @override_settings(USING_PGROONGA=True)
+    def test_get_old_messages_with_search_queries_pgroonga(self):
+        query_ids = self.get_query_ids()
+
+        sql_template = u"SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.subject_matches \nFROM (SELECT message_id, flags, subject, rendered_content, pgroonga.match_positions_byte(rendered_content, pgroonga.query_extract_keywords('jumping')) AS content_matches, pgroonga.match_positions_byte(subject, pgroonga.query_extract_keywords('jumping')) AS subject_matches \nFROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \nWHERE user_profile_id = 2 AND (search_pgroonga @@ 'jumping') AND message_id >= 0 ORDER BY message_id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC"
+        sql = sql_template.format(**query_ids)
+        self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10,
+                                                  'narrow': '[["search", "jumping"]]'},
+                                                 sql)
+
+        sql_template = "SELECT anon_1.message_id, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.subject_matches \nFROM (SELECT id AS message_id, subject, rendered_content, pgroonga.match_positions_byte(rendered_content, pgroonga.query_extract_keywords('jumping')) AS content_matches, pgroonga.match_positions_byte(subject, pgroonga.query_extract_keywords('jumping')) AS subject_matches \nFROM zerver_message \nWHERE recipient_id = 9 AND (search_pgroonga @@ 'jumping') AND zerver_message.id >= 0 ORDER BY zerver_message.id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC"
+        sql = sql_template.format(**query_ids)
+        self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10,
+                                                  'narrow': '[["stream", "Scotland"], ["search", "jumping"]]'},
+                                                 sql)
+
+        sql_template = 'SELECT anon_1.message_id, anon_1.flags, anon_1.subject, anon_1.rendered_content, anon_1.content_matches, anon_1.subject_matches \nFROM (SELECT message_id, flags, subject, rendered_content, pgroonga.match_positions_byte(rendered_content, pgroonga.query_extract_keywords(\'"jumping" quickly\')) AS content_matches, pgroonga.match_positions_byte(subject, pgroonga.query_extract_keywords(\'"jumping" quickly\')) AS subject_matches \nFROM zerver_usermessage JOIN zerver_message ON zerver_usermessage.message_id = zerver_message.id \nWHERE user_profile_id = 2 AND (search_pgroonga @@ \'"jumping" quickly\') AND message_id >= 0 ORDER BY message_id ASC \n LIMIT 10) AS anon_1 ORDER BY message_id ASC'
+        sql = sql_template.format(**query_ids)
+        self.common_check_get_old_messages_query({'anchor': 0, 'num_before': 0, 'num_after': 10,
+                                                  'narrow': '[["search", "\\"jumping\\" quickly"]]'},
+                                                 sql)
diff --git a/zerver/views/messages.py b/zerver/views/messages.py
index d54e6da435..426dba5e94 100644
--- a/zerver/views/messages.py
+++ b/zerver/views/messages.py
@@ -10,7 +10,7 @@ from django.db.models import Q
 from django.http import HttpRequest, HttpResponse
 from six import text_type
 from typing import Any, AnyStr, Iterable, Optional, Tuple
-from zerver.lib.str_utils import force_text
+from zerver.lib.str_utils import force_bytes, force_text
 
 from zerver.decorator import authenticated_api_view, authenticated_json_post_view, \
     has_request_variables, REQ, JsonableError, \
@@ -236,6 +236,23 @@ class NarrowBuilder(object):
             return query.where(maybe_negate(cond))
 
     def by_search(self, query, operand, maybe_negate):
+        if settings.USING_PGROONGA:
+            return self._by_search_pgroonga(query, operand, maybe_negate)
+        else:
+            return self._by_search_tsearch(query, operand, maybe_negate)
+
+    def _by_search_pgroonga(self, query, operand, maybe_negate):
+        match_positions_byte = func.pgroonga.match_positions_byte
+        query_extract_keywords = func.pgroonga.query_extract_keywords
+        keywords = query_extract_keywords(operand)
+        query = query.column(match_positions_byte(column("rendered_content"),
+                                                  keywords).label("content_matches"))
+        query = query.column(match_positions_byte(column("subject"),
+                                                  keywords).label("subject_matches"))
+        condition = column("search_pgroonga").op("@@")(operand)
+        return query.where(maybe_negate(condition))
+
+    def _by_search_tsearch(self, query, operand, maybe_negate):
         tsquery = func.plainto_tsquery(literal("zulip.english_us_search"), literal(operand))
         ts_locs_array = func.ts_match_locs_array
         query = query.column(ts_locs_array(literal("zulip.english_us_search"),
@@ -264,7 +281,7 @@ class NarrowBuilder(object):
 # Apparently, the offsets we get from tsearch_extras are counted in
 # unicode characters, not in bytes, so we do our processing with text,
 # not bytes.
-def highlight_string(text, locs):
+def highlight_string_text_offsets(text, locs):
     # type: (AnyStr, Iterable[Tuple[int, int]]) -> text_type
     string = force_text(text)
     highlight_start = u'<span class="highlight">'
@@ -281,6 +298,30 @@ def highlight_string(text, locs):
     result += string[pos:]
     return result
 
+def highlight_string_bytes_offsets(text, locs):
+    # type: (AnyStr, Iterable[Tuple[int, int]]) -> text_type
+    string = force_bytes(text)
+    highlight_start = b'<span class="highlight">'
+    highlight_stop = b'</span>'
+    pos = 0
+    result = b''
+    for loc in locs:
+        (offset, length) = loc
+        result += string[pos:offset]
+        result += highlight_start
+        result += string[offset:offset + length]
+        result += highlight_stop
+        pos = offset + length
+    result += string[pos:]
+    return force_text(result)
+
+def highlight_string(text, locs):
+    # type: (AnyStr, Iterable[Tuple[int, int]]) -> text_type
+    if settings.USING_PGROONGA:
+        return highlight_string_bytes_offsets(text, locs)
+    else:
+        return highlight_string_text_offsets(text, locs)
+
 def get_search_fields(rendered_content, subject, content_matches, subject_matches):
     # type: (text_type, text_type, Iterable[Tuple[int, int]], Iterable[Tuple[int, int]]) -> Dict[str, text_type]
     return dict(match_content=highlight_string(rendered_content, content_matches),
diff --git a/zproject/dev_settings.py b/zproject/dev_settings.py
index 1fafadacb3..91f02dccbb 100644
--- a/zproject/dev_settings.py
+++ b/zproject/dev_settings.py
@@ -25,3 +25,4 @@ TERMS_OF_SERVICE = 'zproject/terms.md.template'
 SAVE_FRONTEND_STACKTRACES = True
 EVENT_LOGS_ENABLED = True
 SYSTEM_ONLY_REALMS = set() # type: Set[str]
+USING_PGROONGA = True
diff --git a/zproject/settings.py b/zproject/settings.py
index 57fa996aa6..a0afb1bf44 100644
--- a/zproject/settings.py
+++ b/zproject/settings.py
@@ -173,7 +173,8 @@ DEFAULT_SETTINGS = {'TWITTER_CONSUMER_KEY': '',
                     'TERMS_OF_SERVICE': None,
                     'TOS_VERSION': None,
                     'SYSTEM_ONLY_REALMS': {"zulip.com"},
-                    'FIRST_TIME_TOS_TEMPLATE': None
+                    'FIRST_TIME_TOS_TEMPLATE': None,
+                    'USING_PGROONGA': False,
                     }
 
 for setting_name, setting_val in six.iteritems(DEFAULT_SETTINGS):
@@ -332,7 +333,10 @@ INSTALLED_APPS = [
     'guardian',
     'pipeline',
     'zerver',
-] + EXTRA_INSTALLED_APPS
+]
+if USING_PGROONGA:
+    INSTALLED_APPS += ['pgroonga']
+INSTALLED_APPS += EXTRA_INSTALLED_APPS
 
 ZILENCER_ENABLED = 'zilencer' in INSTALLED_APPS
 
@@ -379,6 +383,16 @@ elif REMOTE_POSTGRES_HOST != '':
     else:
         DATABASES['default']['OPTIONS']['sslmode'] = 'verify-full'
 
+if USING_PGROONGA:
+    # We need to have "pgroonga" schema before "pg_catalog" schema in
+    # the PostgreSQL search path, because "pgroonga" schema overrides
+    # the "@@" operator from "pg_catalog" schema, and "pg_catalog"
+    # schema is searched first if not specified in the search path.
+    # See also: http://www.postgresql.org/docs/current/static/runtime-config-client.html
+    pg_options = '-c search_path=%(SCHEMA)s,zulip,public,pgroonga,pg_catalog' % \
+        DATABASES['default']
+    DATABASES['default']['OPTIONS']['options'] = pg_options
+
 ########################################################################
 # RABBITMQ CONFIGURATION
 ########################################################################