zulip/scripts/setup/reindex-textual-data

#!/usr/bin/env python3

import argparse
import logging
import os
import sys
import time

sys.path.append(os.path.join(os.path.dirname(__file__), "../.."))
from scripts.lib.setup_path import setup_path

setup_path()

import psycopg2
import psycopg2.extensions
from psycopg2.sql import SQL, Identifier

from scripts.lib.zulip_tools import su_to_zulip

su_to_zulip()
os.environ["DJANGO_SETTINGS_MODULE"] = "zproject.settings"
from django.conf import settings

parser = argparse.ArgumentParser(
    description="Reindex all text indexes, for glibc upgrades.  This will take a write lock on every table, unless --concurrently is passed."
)
parser.add_argument(
    "--concurrently", action="store_true", help="reindex concurrently, on Pg ≥ 11; takes longer"
)
parser.add_argument("--force", action="store_true", help="run the reindexing")
options = parser.parse_args()

logging.Formatter.converter = time.gmtime
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger("reindex-textual-data")
logger.setLevel(logging.DEBUG)

pg_args = {}
pg_args["host"] = settings.DATABASES["default"]["HOST"]
pg_args["port"] = settings.DATABASES["default"].get("PORT")
pg_args["password"] = settings.DATABASES["default"].get("PASSWORD")
pg_args["user"] = settings.DATABASES["default"]["USER"]
pg_args["dbname"] = settings.DATABASES["default"]["NAME"]
pg_args["sslmode"] = settings.DATABASES["default"]["OPTIONS"].get("sslmode")
pg_args["connect_timeout"] = "600"

# connection_factory=None lets mypy understand the return type
conn = psycopg2.connect(connection_factory=None, **pg_args)
conn.autocommit = True

pg_server_version = conn.server_version
can_concurrently = pg_server_version >= 110000  # Version 11.0.0

if options.concurrently and not can_concurrently:
    raise RuntimeError("Only PostgreSQL 11 and above can REINDEX CONCURRENTLY.")

cursor = conn.cursor()
cursor.execute(
    """
SELECT
        irel.relname AS index_name,
        trel.relname AS table_name,
        pg_size_pretty(pg_table_size(i.indrelid)) AS size
FROM
        pg_index AS i
        JOIN pg_class AS trel ON trel.oid = i.indrelid
        JOIN pg_namespace AS tnsp ON trel.relnamespace = tnsp.oid
        JOIN pg_class AS irel ON irel.oid = i.indexrelid
        JOIN pg_attribute AS a ON a.attrelid = i.indexrelid
        WHERE tnsp.nspname = 'zulip'
        AND a.attcollation != 0
GROUP BY 1, 2, i.indrelid
ORDER BY pg_table_size(i.indrelid) ASC
"""
)
reindex_targets = cursor.fetchall()

if not options.force:
    print("Would reindex the following indexes:")
    for index, _, _ in reindex_targets:
        print(f"  {index}")
    print(
        """

Re-run with --force to reindex these.  Without --concurrently, these
index rebuilds will TAKE A WRITE LOCK on their respective tables,
which means that production traffic will be affected.

On PostgreSQL 11 and above, you can pass --concurrently, which will do
the index rebuilds without taking such a lock. We recommend only using
--concurrently if the Zulip server will be serving traffic while
running this tool, because concurrent reindexing takes longer to
complete even if the server is not running.
"""
    )
    sys.exit(0)

had_failures = False
for index_name, table_name, index_size in reindex_targets:
    if options.concurrently:
        command = SQL("REINDEX INDEX CONCURRENTLY {}").format(Identifier(index_name))
    else:
        command = SQL("REINDEX INDEX {}").format(Identifier(index_name))

    logger.info("%s -- on %s, %s", command, table_name, index_size)
    try:
        cursor.execute(command)
    except psycopg2.OperationalError as e:
        logger.warning("Failed to reindex %s: %s", index_name, e)
        had_failures = True

if not had_failures:
    sys.exit(0)

print(
    """
=========================> REINDEXING FAILED <=========================

Reindexing failed on one or more indexes; this is generally caused
by duplicate rows that had been incorrectly inserted because of
corrupted database indexes.  This happens, for example, when glibc was
upgraded to 2.28 or higher, which has a different ordering of text
("collation") than earlier versions.  Because of the new ordering,
entries in the indexes are not found (because they exist in a
different place), which results in duplicates being created in the
database for columns which should be unique.

There is no generalized tool to fix these duplicate rows, but in most cases
the database can be repaired with some manual work.  We are using
this chat.zulip.org for support around such repairs:

  https://chat.zulip.org/#narrow/stream/31-production-help/topic/database.20corruption

This tool has regenerated all database indexes that do not have duplicate rows.
Once you have manually repaired duplicate rows, you can rerun this command
to regenerate the rest.

It is reasonable to run your Zulip server to avoid downtime while you
plan the manual repair work. Additional duplicate rows may be created
for any corrupted index that has not yet been regenerated. But any new
duplicate rows will be similar to existing duplicate rows that you
already need to manually repair.
    """
)
sys.exit(1)