diff --git a/docs/production/export-and-import.md b/docs/production/export-and-import.md index 98e78fad09..6dc859737b 100644 --- a/docs/production/export-and-import.md +++ b/docs/production/export-and-import.md @@ -245,10 +245,8 @@ To restore from a manual backup, the process is basically the reverse of the abo This restoration process can also be used to migrate a Zulip installation from one server to another. -We recommend running a disaster recovery test after setting up your -backups to confirm that your backups are working. You may also want to -monitor that they are up to date using the Nagios plugin at: -`puppet/zulip/files/nagios_plugins/zulip_postgresql_backups/check_postgresql_backup`. +We recommend running a disaster recovery test after setting up your backups to +confirm that your backups are working. ## Data export @@ -528,9 +526,9 @@ it may be minutes before the backup is saved into S3 -- see If you need always-current backup availability, Zulip also has [built-in database replication support](postgresql.md#postgresql-warm-standby). -You can (and should) monitor that backups are running regularly via -the Nagios plugin installed into -`/usr/lib/nagios/plugins/zulip_postgresql_backups/check_postgresql_backup`. +You can (and should) monitor that backups are running regularly, for instance +via the Prometheus exporter found in +`puppet/zulip/files/postgresql/wal-g-exporter`. ### Streaming backups to S3 diff --git a/docs/production/troubleshooting.md b/docs/production/troubleshooting.md index 6d27dee330..c4f79d1440 100644 --- a/docs/production/troubleshooting.md +++ b/docs/production/troubleshooting.md @@ -296,7 +296,6 @@ Database monitoring: - `check_fts_update_log`: Checks whether full-text search updates are being processed properly or getting backlogged. - `check_postgres`: General checks for database health. -- `check_postgresql_backup`: Checks status of PostgreSQL backups. 
- `check_postgresql_replication_lag`: Checks whether PostgreSQL streaming replication is up to date. diff --git a/puppet/kandra/files/nagios4/commands.cfg b/puppet/kandra/files/nagios4/commands.cfg index a2ecc9bf9a..f1accba482 100644 --- a/puppet/kandra/files/nagios4/commands.cfg +++ b/puppet/kandra/files/nagios4/commands.cfg @@ -145,11 +145,6 @@ define command { command_line /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_postgresql/check_postgresql_replication_lag' } -define command { - command_name check_postgresql_backup - command_line /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_postgresql_backups/check_postgresql_backup' -} - define command{ command_name check_worker_memory command_line /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_worker_memory' diff --git a/puppet/kandra/files/nagios4/conf.d/services.cfg b/puppet/kandra/files/nagios4/conf.d/services.cfg index c056790ea4..794bfb14a8 100644 --- a/puppet/kandra/files/nagios4/conf.d/services.cfg +++ b/puppet/kandra/files/nagios4/conf.d/services.cfg @@ -273,14 +273,6 @@ define service { contact_groups admins } -define service { - use generic-service - service_description Check last PostgreSQL backup time - hostgroup_name postgresql - check_command check_postgresql_backup - contact_groups admins -} - #### Redis diff --git a/puppet/zulip/files/nagios_plugins/zulip_postgresql_backups/check_postgresql_backup b/puppet/zulip/files/nagios_plugins/zulip_postgresql_backups/check_postgresql_backup deleted file mode 100755 index ead5f96d80..0000000000 --- a/puppet/zulip/files/nagios_plugins/zulip_postgresql_backups/check_postgresql_backup +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python3 - -import subprocess -import sys -from 
datetime import datetime, timedelta, timezone - -import dateutil.parser - -states = { - "OK": 0, - "WARNING": 1, - "CRITICAL": 2, - "UNKNOWN": 3, -} - - -def report(state: str, msg: str) -> None: - print(f"{state}: {msg}") - sys.exit(states[state]) - - -replicas = subprocess.check_output( - [ - "psql", - "-v", - "ON_ERROR_STOP=1", - "postgres", - "-t", - "-c", - "SELECT COUNT(*) FROM pg_stat_replication", - ], - stdin=subprocess.DEVNULL, - text=True, -).strip() -if int(replicas) > 0: - # We are the primary and we have replicas; we expect that backups - # will be taken on one of them. - report("OK", "this is the primary, with backups taken on the replicas") - -skip_backups = subprocess.run( - ["crudini", "--get", "/etc/zulip/zulip.conf", "postgresql", "skip_backups"], - capture_output=True, - text=True, - check=False, -) -if skip_backups.returncode == 0 and skip_backups.stdout.strip().lower() in [ - 1, - "y", - "t", - "yes", - "true", - "enable", - "enabled", -]: - report("OK", "backups are disabled on this host") - -try: - with open("/var/lib/nagios_state/last_postgresql_backup") as f: - last_backup = dateutil.parser.parse(f.read()) -except OSError: - report("UNKNOWN", "could not determine completion time of last PostgreSQL backup") - -if datetime.now(tz=timezone.utc) - last_backup > timedelta(hours=25): - report("CRITICAL", f"last PostgreSQL backup completed more than 25 hours ago: {last_backup}") - -report("OK", f"last PostgreSQL backup completed less than 25 hours ago: {last_backup}") diff --git a/puppet/zulip/files/postgresql/pg_backup_and_purge b/puppet/zulip/files/postgresql/pg_backup_and_purge index c3800bcfcc..b1d3a85d90 100755 --- a/puppet/zulip/files/postgresql/pg_backup_and_purge +++ b/puppet/zulip/files/postgresql/pg_backup_and_purge @@ -58,11 +58,6 @@ env = os.environ.copy() env["WALG_UPLOAD_DISK_CONCURRENCY"] = disk_concurrency subprocess.check_call(["env-wal-g", "backup-push", pg_data_path], env=env) -now = datetime.now(tz=timezone.utc) -with 
open("/var/lib/nagios_state/last_postgresql_backup", "w") as f: - f.write(now.isoformat()) - f.write("\n") - backups = {} lines = subprocess.check_output(["env-wal-g", "backup-list"], text=True).split("\n") for line in lines[1:]: @@ -70,7 +65,7 @@ for line in lines[1:]: backup_name, date_str = line.split()[0:2] backups[dateutil.parser.parse(date_str)] = backup_name -one_month_ago = now - timedelta(days=30) +one_month_ago = datetime.now(tz=timezone.utc) - timedelta(days=30) for date in sorted(backups.keys(), reverse=True): if date < one_month_ago: # We pass `FIND_FULL` such that if delta backups are being diff --git a/puppet/zulip/manifests/postgresql_backups.pp b/puppet/zulip/manifests/postgresql_backups.pp index 86b407cd5a..3e79923095 100644 --- a/puppet/zulip/manifests/postgresql_backups.pp +++ b/puppet/zulip/manifests/postgresql_backups.pp @@ -45,14 +45,4 @@ class zulip::postgresql_backups { mode => '0600', } } - - file { "${zulip::common::nagios_plugins_dir}/zulip_postgresql_backups": - require => Package[$zulip::common::nagios_plugins], - recurse => true, - purge => true, - owner => 'root', - group => 'root', - mode => '0755', - source => 'puppet:///modules/zulip/nagios_plugins/zulip_postgresql_backups', - } }