puppet: Remove check_postgresql_backup.

We have replaced this monitoring with the black-box wal-g monitoring,
which is more accurate.
This commit is contained in:
Alex Vandiver 2024-05-09 18:02:27 +00:00 committed by Tim Abbott
parent fef31614d3
commit 230040caa9
7 changed files with 6 additions and 103 deletions

View File

@ -245,10 +245,8 @@ To restore from a manual backup, the process is basically the reverse of the abo
This restoration process can also be used to migrate a Zulip
installation from one server to another.
We recommend running a disaster recovery test after setting up your
backups to confirm that your backups are working. You may also want to
monitor that they are up to date using the Nagios plugin at:
`puppet/zulip/files/nagios_plugins/zulip_postgresql_backups/check_postgresql_backup`.
We recommend running a disaster recovery test after setting up your backups to
confirm that your backups are working.
## Data export
@ -528,9 +526,9 @@ it may be minutes before the backup is saved into S3 -- see
If you need always-current backup availability, Zulip also has
[built-in database replication support](postgresql.md#postgresql-warm-standby).
You can (and should) monitor that backups are running regularly via
the Nagios plugin installed into
`/usr/lib/nagios/plugins/zulip_postgresql_backups/check_postgresql_backup`.
You can (and should) monitor that backups are running regularly, for instance
via the Prometheus exporter found in
`puppet/zulip/files/postgresql/wal-g-exporter`.
### Streaming backups to S3

View File

@ -296,7 +296,6 @@ Database monitoring:
- `check_fts_update_log`: Checks whether full-text search updates are
being processed properly or getting backlogged.
- `check_postgres`: General checks for database health.
- `check_postgresql_backup`: Checks status of PostgreSQL backups.
- `check_postgresql_replication_lag`: Checks whether PostgreSQL streaming
replication is up to date.

View File

@ -145,11 +145,6 @@ define command {
command_line /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_postgresql/check_postgresql_replication_lag'
}
# Nagios command definition for `check_postgresql_backup`: uses check_by_ssh
# to log in to the monitored host ($HOSTADDRESS$) as the `nagios` user and run
# the zulip_postgresql_backups/check_postgresql_backup plugin there.
define command {
command_name check_postgresql_backup
command_line /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_postgresql_backups/check_postgresql_backup'
}
define command{
command_name check_worker_memory
command_line /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_worker_memory'

View File

@ -273,14 +273,6 @@ define service {
contact_groups admins
}
# Service check attached to every host in the `postgresql` hostgroup; it runs
# the `check_postgresql_backup` command and is associated with the `admins`
# contact group. Other settings are inherited from the `generic-service`
# template.
define service {
use generic-service
service_description Check last PostgreSQL backup time
hostgroup_name postgresql
check_command check_postgresql_backup
contact_groups admins
}
#### Redis

View File

@ -1,66 +0,0 @@
#!/usr/bin/env python3
import subprocess
import sys
from datetime import datetime, timedelta, timezone
import dateutil.parser
# Nagios plugin exit codes, keyed by the state name printed in the status line.
states = {
    "OK": 0,
    "WARNING": 1,
    "CRITICAL": 2,
    "UNKNOWN": 3,
}

# File holding the ISO-8601 completion time of the most recent backup.
LAST_BACKUP_PATH = "/var/lib/nagios_state/last_postgresql_backup"

# A backup older than this is reported as CRITICAL.  The status messages in
# main() spell out "25 hours" literally, so keep them in sync with this value.
MAX_BACKUP_AGE = timedelta(hours=25)

# Spellings of `postgresql.skip_backups` that mean "backups are disabled".
# Every entry must be a *string*: the value read from zulip.conf is always a
# str, so an integer 1 in this collection could never match "1".
TRUTHY_SETTINGS = frozenset({"1", "y", "t", "yes", "true", "enable", "enabled"})


def report(state: str, msg: str) -> None:
    """Print a ``STATE: message`` status line and exit; never returns.

    Args:
        state: One of the keys of ``states``.
        msg: Human-readable detail appended after the state name.

    Raises:
        SystemExit: Always, carrying the exit code mapped to ``state``.
        KeyError: If ``state`` is not a key of ``states``.
    """
    print(f"{state}: {msg}")
    sys.exit(states[state])


def is_truthy_setting(raw: str) -> bool:
    """Return whether a raw config value is one of the accepted "true" spellings.

    Surrounding whitespace and letter case are ignored.
    """
    return raw.strip().lower() in TRUTHY_SETTINGS


def backup_is_stale(
    last_backup: datetime, now: datetime, max_age: timedelta = MAX_BACKUP_AGE
) -> bool:
    """Return True when ``last_backup`` is strictly more than ``max_age`` before ``now``."""
    return now - last_backup > max_age


def count_replicas() -> int:
    """Return the number of rows in ``pg_stat_replication``, as reported by psql.

    Raises:
        subprocess.CalledProcessError: If psql exits non-zero.
    """
    output = subprocess.check_output(
        [
            "psql",
            "-v",
            "ON_ERROR_STOP=1",
            "postgres",
            "-t",
            "-c",
            "SELECT COUNT(*) FROM pg_stat_replication",
        ],
        stdin=subprocess.DEVNULL,
        text=True,
    )
    return int(output.strip())


def backups_disabled() -> bool:
    """Return whether zulip.conf sets ``postgresql.skip_backups`` to a true value.

    A non-zero exit from crudini is treated as "backups are not disabled".
    """
    result = subprocess.run(
        ["crudini", "--get", "/etc/zulip/zulip.conf", "postgresql", "skip_backups"],
        capture_output=True,
        text=True,
        check=False,
    )
    return result.returncode == 0 and is_truthy_setting(result.stdout)


def main() -> None:
    """Run the check and exit with the matching Nagios status code."""
    if count_replicas() > 0:
        # We are the primary and we have replicas; we expect that backups
        # will be taken on one of them.
        report("OK", "this is the primary, with backups taken on the replicas")

    if backups_disabled():
        report("OK", "backups are disabled on this host")

    try:
        with open(LAST_BACKUP_PATH) as f:
            last_backup = dateutil.parser.parse(f.read())
    except (OSError, ValueError, OverflowError):
        # ValueError/OverflowError cover an empty or corrupt state file.  Left
        # uncaught, the traceback would exit with status 1, which Nagios
        # interprets as WARNING rather than UNKNOWN.
        report("UNKNOWN", "could not determine completion time of last PostgreSQL backup")

    if backup_is_stale(last_backup, datetime.now(tz=timezone.utc)):
        report("CRITICAL", f"last PostgreSQL backup completed more than 25 hours ago: {last_backup}")
    report("OK", f"last PostgreSQL backup completed less than 25 hours ago: {last_backup}")


if __name__ == "__main__":
    main()

View File

@ -58,11 +58,6 @@ env = os.environ.copy()
env["WALG_UPLOAD_DISK_CONCURRENCY"] = disk_concurrency
subprocess.check_call(["env-wal-g", "backup-push", pg_data_path], env=env)
now = datetime.now(tz=timezone.utc)
with open("/var/lib/nagios_state/last_postgresql_backup", "w") as f:
f.write(now.isoformat())
f.write("\n")
backups = {}
lines = subprocess.check_output(["env-wal-g", "backup-list"], text=True).split("\n")
for line in lines[1:]:
@ -70,7 +65,7 @@ for line in lines[1:]:
backup_name, date_str = line.split()[0:2]
backups[dateutil.parser.parse(date_str)] = backup_name
one_month_ago = now - timedelta(days=30)
one_month_ago = datetime.now(tz=timezone.utc) - timedelta(days=30)
for date in sorted(backups.keys(), reverse=True):
if date < one_month_ago:
# We pass `FIND_FULL` such that if delta backups are being

View File

@ -45,14 +45,4 @@ class zulip::postgresql_backups {
mode => '0600',
}
}
# Manage the zulip_postgresql_backups directory under the Nagios plugins dir:
# its contents are copied recursively from the module's
# nagios_plugins/zulip_postgresql_backups files, with `purge` enabled, owned
# by root:root with mode 0755.  Requires the Nagios plugins package.
file { "${zulip::common::nagios_plugins_dir}/zulip_postgresql_backups":
require => Package[$zulip::common::nagios_plugins],
recurse => true,
purge => true,
owner => 'root',
group => 'root',
mode => '0755',
source => 'puppet:///modules/zulip/nagios_plugins/zulip_postgresql_backups',
}
}