#!/usr/bin/env python3
import argparse
import logging
import os
import sys
import time

import psycopg2
import psycopg2.extensions
from psycopg2.sql import SQL, Identifier
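
# Make the repository root importable, so that the scripts.lib helpers
# imported below resolve when this tool is run directly.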
sys.path.append(os.path.join(os.path.dirname(__file__), "../.."))
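
# setup_path() points sys.path at the deployment's virtualenv, and
# su_to_zulip() switches to the zulip user (the Django settings and
# database credentials below are only readable as that user).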
from scripts.lib.setup_path import setup_path

setup_path()

from scripts.lib.zulip_tools import su_to_zulip

su_to_zulip()
os.environ["DJANGO_SETTINGS_MODULE"] = "zproject.settings"
from django.conf import settings
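
# Dry run by default: --force actually reindexes, and --concurrently
# trades a slower rebuild for not taking write locks.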
parser = argparse.ArgumentParser(
    description="Reindex all text indexes, for glibc upgrades. This will take a write lock on every table, unless --concurrently is passed."
)
parser.add_argument(
    "--concurrently", action="store_true", help="reindex concurrently, on Pg ≥ 11; takes longer"
)
parser.add_argument("--force", action="store_true", help="run the reindexing")
options = parser.parse_args()
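
# Typical invocations (illustrative; the tool name here is assumed from the
# logger name below, and the flags are the ones defined above):
#   ./reindex-textual-data                          # dry run: list affected indexes
#   ./reindex-textual-data --force                  # rebuild, taking write locks
#   ./reindex-textual-data --force --concurrently   # rebuild without write locks (PostgreSQL 11+)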

logging.Formatter.converter = time.gmtime
logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger("reindex-textual-data")
logger.setLevel(logging.DEBUG)
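
# Build the libpq connection arguments from Django's database settings.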
pg_args = {}
pg_args["host"] = settings.DATABASES["default"]["HOST"]
pg_args["port"] = settings.DATABASES["default"].get("PORT")
pg_args["password"] = settings.DATABASES["default"].get("PASSWORD")
pg_args["user"] = settings.DATABASES["default"]["USER"]
pg_args["dbname"] = settings.DATABASES["default"]["NAME"]
pg_args["sslmode"] = settings.DATABASES["default"]["OPTIONS"].get("sslmode")
pg_args["connect_timeout"] = "600"
conn = psycopg2.connect(**pg_args)
conn.autocommit = True

pg_server_version = conn.server_version
can_concurrently = pg_server_version >= 110000  # PostgreSQL 11.0

if options.concurrently and not can_concurrently:
    raise RuntimeError("Only PostgreSQL 11 and above can REINDEX CONCURRENTLY.")
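
# Find every index in the zulip schema that covers at least one collatable
# (i.e. textual) column; those are the indexes whose on-disk ordering
# depends on the glibc collation rules.  Smallest tables sort first, so
# the quick rebuilds happen early.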
cursor = conn.cursor()
cursor.execute(
    """
SELECT
    irel.relname AS index_name,
    trel.relname AS table_name,
    pg_size_pretty(pg_table_size(i.indrelid)) AS size
FROM
    pg_index AS i
    JOIN pg_class AS trel ON trel.oid = i.indrelid
    JOIN pg_namespace AS tnsp ON trel.relnamespace = tnsp.oid
    JOIN pg_class AS irel ON irel.oid = i.indexrelid
    CROSS JOIN unnest(i.indkey) colnum
    JOIN pg_attribute AS a ON trel.oid = a.attrelid
        AND a.attnum = colnum
WHERE tnsp.nspname = 'zulip'
    AND a.attcollation != 0
GROUP BY 1, 2, i.indrelid
ORDER BY pg_table_size(i.indrelid) ASC
"""
)
reindex_targets = cursor.fetchall()
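
# Without --force, just report what would be rebuilt and exit.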
if not options.force:
    print("Would reindex the following indexes:")
    for index, _, _ in reindex_targets:
        print(f"  {index}")
    print(
        """
Re-run with --force to reindex these. Without --concurrently, these
index rebuilds will TAKE A WRITE LOCK on their respective tables,
which means that production traffic will be affected.

On PostgreSQL 11 and above, you can pass --concurrently, which will do
the index rebuilds without taking such a lock. We recommend only using
--concurrently if the Zulip server will be serving traffic while
running this tool, because concurrent reindexing takes longer to
complete even if the server is not running.
"""
    )
    sys.exit(0)
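
# Rebuild each target index.  Identifier() quotes the index name safely,
# since REINDEX targets cannot be passed as ordinary query parameters.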
had_failures = False
for index_name, table_name, index_size in reindex_targets:
    if options.concurrently:
        command = SQL("REINDEX INDEX CONCURRENTLY {}").format(Identifier(index_name))
    else:
        command = SQL("REINDEX INDEX {}").format(Identifier(index_name))

    logger.info("%s -- on %s, %s", command, table_name, index_size)
    try:
        cursor.execute(command)
    except psycopg2.OperationalError as e:
        logger.warning("Failed to reindex %s: %s", index_name, e)
        had_failures = True

if not had_failures:
    sys.exit(0)
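
# At least one rebuild failed, most likely on a UNIQUE index whose table
# contains duplicate rows; explain how to repair and exit nonzero.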
print(
    """
=========================> REINDEXING FAILED <=========================

Reindexing failed on one or more indexes; this is generally caused by
duplicate rows that were incorrectly inserted because of corrupted
database indexes. This happens, for example, when glibc is upgraded to
2.28 or higher, which orders text ("collation") differently than
earlier versions. Because of the new ordering, existing index entries
are not found (they now sort to a different place), which results in
duplicates being created in the database for columns which should be
unique.

There is no generalized tool to fix these duplicate rows, but in most
cases the database can be repaired with some manual work. We are using
this chat.zulip.org topic for support with such repairs:

  https://chat.zulip.org/#narrow/stream/31-production-help/topic/database.20corruption

This tool has regenerated all database indexes that do not have
duplicate rows. Once you have manually repaired the duplicate rows,
you can rerun this command to regenerate the rest.

It is reasonable to run your Zulip server while you plan the manual
repair work, to avoid downtime. Additional duplicate rows may be
created for any corrupted index that has not yet been regenerated, but
any new duplicate rows will be similar to the existing ones that you
already need to repair by hand.
"""
)
sys.exit(1)