pg_backup_and_purge: Take backups on replicas, if present.

Taking backups on the database primary adds additional disk load,
which can impact the performance of the application.

Switch to taking backups on replicas, if they exist.  Some deployments
may have multiple replicas, and taking backups on all of them is
wasteful and potentially confusing; add a flag to inhibit taking
nightly snapshots on the host.

If the deployment is a single instance of PostgreSQL, with no
replicas, it takes backups as before, modulo the extra flag to allow
skipping taking them.
This commit is contained in:
Alex Vandiver 2023-04-26 01:21:36 +00:00 committed by Tim Abbott
parent 4b35211ca1
commit 19a11c9556
3 changed files with 66 additions and 13 deletions

View File

@ -836,6 +836,14 @@ based on the `pg_hba.conf` file; if you are using password
authentication, you can set a `postgresql_replication_password` secret
for authentication.
#### `skip_backups`
If set to as true value, inhibits the nightly [`wal-g` backups][wal-g] which
would be taken on all non-replicated hosts and [all warm standby
replicas](#postgresql-warm-standby). This is generally only set if you have
multiple warm standby replicas, in order to avoid taking multiple backups, one
per replica.
#### `ssl_ca_file`
Set to the path to the PEM-encoded certificate authority used to

View File

@ -19,13 +19,39 @@ def report(state: str, msg: str) -> None:
sys.exit(states[state])
if (
subprocess.check_output(
["psql", "-v", "ON_ERROR_STOP=1", "postgres", "-t", "-c", "SELECT pg_is_in_recovery()"]
replicas = subprocess.check_output(
[
"psql",
"-v",
"ON_ERROR_STOP=1",
"postgres",
"-t",
"-c",
"SELECT COUNT(*) FROM pg_stat_replication",
],
stdin=subprocess.DEVNULL,
text=True,
).strip()
!= b"f"
):
report("OK", "this is not the primary")
if int(replicas) > 0:
# We are the primary and we have replicas; we expect that backups
# will be taken on one of them.
report("OK", "this is the primary, with backups taken on the replicas")
skip_backups = subprocess.run(
["crudini", "--get", "/etc/zulip/zulip.conf", "postgresql", "skip_backups"],
capture_output=True,
text=True,
)
if skip_backups.returncode == 0 and skip_backups.stdout.strip().lower() in [
1,
"y",
"t",
"yes",
"true",
"enable",
"enabled",
]:
report("OK", "backups are disabled on this host")
try:
with open("/var/lib/nagios_state/last_postgresql_backup") as f:

View File

@ -1,4 +1,5 @@
#!/usr/bin/env python3
import configparser
import glob
import logging
import os
@ -14,14 +15,32 @@ logging.basicConfig(format="%(asctime)s %(levelname)s: %(message)s")
logger = logging.getLogger(__name__)
recovery_val = subprocess.check_output(
["psql", "-v", "ON_ERROR_STOP=1", "-t", "-c", "SELECT pg_is_in_recovery()"],
config_file = configparser.RawConfigParser()
config_file.read("/etc/zulip/zulip.conf")
def get_config(
section: str,
key: str,
default_value: str = "",
) -> str:
if config_file.has_option(section, key):
return config_file.get(section, key)
return default_value
replicas = subprocess.check_output(
["psql", "-v", "ON_ERROR_STOP=1", "-t", "-c", "SELECT COUNT(*) FROM pg_stat_replication"],
stdin=subprocess.DEVNULL,
text=True,
).strip()
# Assertion to check that we're extracting the value correctly.
assert recovery_val in ["t", "f"]
if recovery_val == "t":
# Only run if we're the primary
if int(replicas) > 0:
# We are the primary and we have replicas; we expect that backups
# will be taken on one of them.
sys.exit(0)
skip_backups = get_config("postgresql", "skip_backups", "")
if skip_backups.lower() in [1, "y", "t", "yes", "true", "enable", "enabled"]:
sys.exit(0)
is_rhel_based = os.path.exists("/etc/redhat-release")