diff --git a/docs/production/upgrade-or-modify.md b/docs/production/upgrade-or-modify.md
index be1cec4861..6a1352e599 100644
--- a/docs/production/upgrade-or-modify.md
+++ b/docs/production/upgrade-or-modify.md
@@ -288,7 +288,16 @@ instructions for other supported platforms.
     /home/zulip/deployments/current/scripts/setup/upgrade-postgresql
     ```
 
-5. Finally, we need to reinstall the current version of Zulip, which
+5. Ubuntu 20.04 has a different version of the low-level glibc
+   library, which affects how PostgreSQL orders text data (known as
+   "collations"); this corrupts database indexes that rely on
+   collations. Regenerate the affected indexes by running:
+
+    ```
+    /home/zulip/deployments/current/scripts/setup/reindex-textual-data --force
+    ```
+
+6. Finally, we need to reinstall the current version of Zulip, which
    among other things will recompile Zulip's Python module
    dependencies for your new version of Python and rewrite Zulip's
    full-text search indexes to work with the upgraded dictionary
@@ -445,7 +454,16 @@ instructions for other supported platforms.
 6. [Upgrade to the latest Zulip release](#upgrading-to-a-release), now
    that your server is running a supported operating system.
 
-7. As root, finish by verifying the contents of the full-text indexes:
+7. Debian Buster has a different version of the low-level glibc
+   library, which affects how PostgreSQL orders text data (known as
+   "collations"); this corrupts database indexes that rely on
+   collations. Regenerate the affected indexes by running:
+
+    ```
+    /home/zulip/deployments/current/scripts/setup/reindex-textual-data --force
+    ```
+
+8. 
As root, finish by verifying the contents of the full-text indexes:
 
     ```
     /home/zulip/deployments/current/manage.py audit_fts_indexes
diff --git a/scripts/setup/reindex-textual-data b/scripts/setup/reindex-textual-data
new file mode 100755
index 0000000000..4a817af063
--- /dev/null
+++ b/scripts/setup/reindex-textual-data
@@ -0,0 +1,162 @@
+#!/usr/bin/env python3
+#
+# Rebuild every index in the "zulip" schema that covers a column with a
+# collation.  A glibc upgrade can change how text is ordered, which
+# corrupts such indexes until they are regenerated.
+#
+# Without --force, only list the indexes that would be rebuilt.
+
+import argparse
+import logging
+import os
+import sys
+import time
+
+import psycopg2
+import psycopg2.extensions
+from psycopg2.sql import SQL, Identifier
+
+sys.path.append(os.path.join(os.path.dirname(__file__), "../.."))
+from scripts.lib.setup_path import setup_path
+
+setup_path()
+
+from scripts.lib.zulip_tools import su_to_zulip
+
+su_to_zulip()
+os.environ["DJANGO_SETTINGS_MODULE"] = "zproject.settings"
+from django.conf import settings
+
+parser = argparse.ArgumentParser(
+    description="Reindex all text indexes, for glibc upgrades. This will take a write lock on every table, unless --concurrently is passed."
+)
+parser.add_argument(
+    "--concurrently", action="store_true", help="reindex concurrently, on Pg ≥ 11; takes longer"
+)
+parser.add_argument("--force", action="store_true", help="run the reindexing")
+options = parser.parse_args()
+
+# Emit log timestamps in UTC.
+logging.Formatter.converter = time.gmtime
+logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s")
+logger = logging.getLogger("reindex-textual-data")
+logger.setLevel(logging.DEBUG)
+
+# Connection parameters come from Django's "default" database settings.
+pg_args = {}
+pg_args["host"] = settings.DATABASES["default"]["HOST"]
+pg_args["port"] = settings.DATABASES["default"].get("PORT")
+pg_args["password"] = settings.DATABASES["default"].get("PASSWORD")
+pg_args["user"] = settings.DATABASES["default"]["USER"]
+pg_args["dbname"] = settings.DATABASES["default"]["NAME"]
+pg_args["sslmode"] = settings.DATABASES["default"]["OPTIONS"].get("sslmode")
+pg_args["connect_timeout"] = "600"
+
+conn = psycopg2.connect(**pg_args)
+# With autocommit on, each REINDEX runs as its own transaction, so a
+# failed one does not poison the statements that follow it.
+conn.autocommit = True
+
+pg_server_version = conn.server_version
+can_concurrently = pg_server_version >= 110000  # Version 11.0.0
+
+if options.concurrently and not can_concurrently:
+    raise RuntimeError("Only PostgreSQL 11 and above can REINDEX CONCURRENTLY.")
+
+# List each index in the "zulip" schema that has at least one column
+# with a collation, ordered by the size of its table (smallest first).
+cursor = conn.cursor()
+cursor.execute(
+    """
+SELECT
+    irel.relname AS index_name,
+    trel.relname AS table_name,
+    pg_size_pretty(pg_table_size(i.indrelid)) AS size
+FROM
+    pg_index AS i
+    JOIN pg_class AS trel ON trel.oid = i.indrelid
+    JOIN pg_namespace AS tnsp ON trel.relnamespace = tnsp.oid
+    JOIN pg_class AS irel ON irel.oid = i.indexrelid
+    CROSS JOIN unnest(i.indkey) colnum
+    JOIN pg_attribute AS a ON trel.oid = a.attrelid
+                          AND a.attnum = colnum
+ WHERE tnsp.nspname = 'zulip'
+   AND a.attcollation != 0
+GROUP BY 1, 2, i.indrelid
+ORDER BY pg_table_size(i.indrelid) ASC
+"""
+)
+reindex_targets = cursor.fetchall()
+
+if not options.force:
+    print("Would reindex the following indexes:")
+    for index, _, _ in reindex_targets:
+        print(f" {index}")
+    print(
+        """
+
+Re-run with --force to reindex these. Without --concurrently, these
+index rebuilds will TAKE A WRITE LOCK on their respective tables,
+which means that production traffic will be affected.
+
+On PostgreSQL 11 and above, you can pass --concurrently, which will do
+the index rebuilds without taking such a lock. We recommend only using
+--concurrently if the Zulip server will be serving traffic while
+running this tool, because concurrent reindexing takes longer to
+complete even if the server is not running.
+"""
+    )
+    sys.exit(0)
+
+had_failures = False
+for index_name, table_name, index_size in reindex_targets:
+    if options.concurrently:
+        command = SQL("REINDEX INDEX CONCURRENTLY {}").format(Identifier(index_name))
+    else:
+        command = SQL("REINDEX INDEX {}").format(Identifier(index_name))
+
+    # Log the rendered SQL rather than the repr of the Composed object.
+    logger.info("%s -- on %s, %s", command.as_string(conn), table_name, index_size)
+    try:
+        cursor.execute(command)
+    except (psycopg2.OperationalError, psycopg2.IntegrityError) as e:
+        # Rebuilding a unique index over duplicate rows fails with a
+        # unique violation, which psycopg2 raises as IntegrityError;
+        # record it and keep going so the other indexes still get rebuilt.
+        logger.warning("Failed to reindex %s: %s", index_name, e)
+        had_failures = True
+
+if not had_failures:
+    sys.exit(0)
+
+print(
+    """
+=========================> REINDEXING FAILED <=========================
+
+Reindexing failed on one or more indexes; this is generally caused
+by duplicate rows that had been incorrectly inserted because of
+corrupted database indexes. This happens, for example, when glibc was
+upgraded to 2.28 or higher, which has a different ordering of text
+("collation") than earlier versions. Because of the new ordering,
+entries in the indexes are not found (because they exist in a
+different place), which results in duplicates being created in the
+database for columns which should be unique.
+
+There is no generalized tool to fix these duplicate rows, but in most cases
+the database can be repaired with some manual work. We are using
+this chat.zulip.org for support around such repairs:
+
+    https://chat.zulip.org/#narrow/stream/31-production-help/topic/database.20corruption
+
+This tool has regenerated all database indexes that do not have duplicate rows.
+Once you have manually repaired duplicate rows, you can rerun this command
+to regenerate the rest.
+
+It is reasonable to run your Zulip server to avoid downtime while you
+plan the manual repair work. Additional duplicate rows may be created
+for any corrupted index that has not yet been regenerated. But any new
+duplicate rows will be similar to existing duplicate rows that you
+already need to manually repair.
+    """
+)
+sys.exit(1)