mirror of https://github.com/zulip/zulip.git
puppet: Remove check_postgresql_backup.
We have replaced this monitoring with the black-box wal-g monitoring, which is more accurate.
This commit is contained in:
parent
fef31614d3
commit
230040caa9
|
@ -245,10 +245,8 @@ To restore from a manual backup, the process is basically the reverse of the abo
|
|||
This restoration process can also be used to migrate a Zulip
|
||||
installation from one server to another.
|
||||
|
||||
We recommend running a disaster recovery test after setting up your
|
||||
backups to confirm that your backups are working. You may also want to
|
||||
monitor that they are up to date using the Nagios plugin at:
|
||||
`puppet/zulip/files/nagios_plugins/zulip_postgresql_backups/check_postgresql_backup`.
|
||||
We recommend running a disaster recovery test after setting up your backups to
|
||||
confirm that your backups are working.
|
||||
|
||||
## Data export
|
||||
|
||||
|
@ -528,9 +526,9 @@ it may be minutes before the backup is saved into S3 -- see
|
|||
If you need always-current backup availability, Zulip also has
|
||||
[built-in database replication support](postgresql.md#postgresql-warm-standby).
|
||||
|
||||
You can (and should) monitor that backups are running regularly via
|
||||
the Nagios plugin installed into
|
||||
`/usr/lib/nagios/plugins/zulip_postgresql_backups/check_postgresql_backup`.
|
||||
You can (and should) monitor that backups are running regularly, for instance
|
||||
via the Prometheus exporter found in
|
||||
`puppet/zulip/files/postgresql/wal-g-exporter`.
|
||||
|
||||
### Streaming backups to S3
|
||||
|
||||
|
|
|
@ -296,7 +296,6 @@ Database monitoring:
|
|||
- `check_fts_update_log`: Checks whether full-text search updates are
|
||||
being processed properly or getting backlogged.
|
||||
- `check_postgres`: General checks for database health.
|
||||
- `check_postgresql_backup`: Checks status of PostgreSQL backups.
|
||||
- `check_postgresql_replication_lag`: Checks whether PostgreSQL streaming
|
||||
replication is up to date.
|
||||
|
||||
|
|
|
@ -145,11 +145,6 @@ define command {
|
|||
command_line /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_postgresql/check_postgresql_replication_lag'
|
||||
}
|
||||
|
||||
# Nagios command definition named check_postgresql_backup: uses check_by_ssh
# (login "nagios", -t 30, key /var/lib/nagios/.ssh/id_ed25519) to run the
# zulip_postgresql_backups/check_postgresql_backup plugin on $HOSTADDRESS$.
define command {
|
||||
command_name check_postgresql_backup
|
||||
command_line /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_postgresql_backups/check_postgresql_backup'
|
||||
}
|
||||
|
||||
define command{
|
||||
command_name check_worker_memory
|
||||
command_line /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_worker_memory'
|
||||
|
|
|
@ -273,14 +273,6 @@ define service {
|
|||
contact_groups admins
|
||||
}
|
||||
|
||||
# Nagios service "Check last PostgreSQL backup time": based on the
# generic-service template, it runs the check_postgresql_backup command for
# the "postgresql" hostgroup and uses the "admins" contact group.
define service {
|
||||
use generic-service
|
||||
service_description Check last PostgreSQL backup time
|
||||
hostgroup_name postgresql
|
||||
check_command check_postgresql_backup
|
||||
contact_groups admins
|
||||
}
|
||||
|
||||
|
||||
#### Redis
|
||||
|
||||
|
|
|
@ -1,66 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
import dateutil.parser
|
||||
|
||||
# Exit codes for each state name that report() can print.
states = {
    "OK": 0,
    "WARNING": 1,
    "CRITICAL": 2,
    "UNKNOWN": 3,
}


def report(state: str, msg: str) -> None:
    """Print a one-line "<state>: <msg>" status and exit the process.

    Args:
        state: One of the keys of ``states`` ("OK", "WARNING", "CRITICAL",
            "UNKNOWN").
        msg: Human-readable detail printed after the state name.

    Raises:
        SystemExit: Always; the exit code is ``states[state]``, or the
            UNKNOWN code when ``state`` is not a recognised key.
    """
    if state not in states:
        # Without this guard a mistyped state would raise KeyError, and the
        # resulting traceback exits with status 1 -- the same value as
        # WARNING above -- which misreports the problem.
        print(f"UNKNOWN: invalid state {state!r}: {msg}")
        sys.exit(states["UNKNOWN"])

    print(f"{state}: {msg}")
    sys.exit(states[state])
|
||||
|
||||
|
||||
# Ask the local PostgreSQL how many rows pg_stat_replication currently has.
# "-t" makes psql print tuples only, so the output is just the count.
replica_count_command = [
    "psql",
    "-v",
    "ON_ERROR_STOP=1",
    "postgres",
    "-t",
    "-c",
    "SELECT COUNT(*) FROM pg_stat_replication",
]
psql_output = subprocess.check_output(
    replica_count_command,
    stdin=subprocess.DEVNULL,
    text=True,
)
replicas = psql_output.strip()

has_replicas = int(replicas) > 0
if has_replicas:
    # We are the primary and we have replicas; we expect that backups
    # will be taken on one of them.
    report("OK", "this is the primary, with backups taken on the replicas")
|
||||
|
||||
# Read the `skip_backups` option from the [postgresql] section of
# /etc/zulip/zulip.conf.  check=False because a non-zero exit from crudini
# is not an error here: the returncode test below then simply falls through
# and backups are treated as enabled.
skip_backups = subprocess.run(
    ["crudini", "--get", "/etc/zulip/zulip.conf", "postgresql", "skip_backups"],
    capture_output=True,
    text=True,
    check=False,
)
# The captured stdout is always a str, so every accepted spelling must be a
# string too; an integer 1 in this list could never match "1".
if skip_backups.returncode == 0 and skip_backups.stdout.strip().lower() in [
    "1",
    "y",
    "t",
    "yes",
    "true",
    "enable",
    "enabled",
]:
    report("OK", "backups are disabled on this host")
|
||||
|
||||
# Load the completion timestamp of the most recent backup from the state
# file and compare its age against the 25-hour threshold below.
try:
    with open("/var/lib/nagios_state/last_postgresql_backup") as f:
        last_backup = dateutil.parser.parse(f.read())
except (OSError, ValueError):
    # OSError: the state file is missing or unreadable.
    # ValueError: the file is empty or does not contain a parseable date
    # (dateutil's ParserError is a ValueError subclass).  Either way the age
    # of the last backup is unknown, so say so instead of crashing.
    report("UNKNOWN", "could not determine completion time of last PostgreSQL backup")

if datetime.now(tz=timezone.utc) - last_backup > timedelta(hours=25):
    report("CRITICAL", f"last PostgreSQL backup completed more than 25 hours ago: {last_backup}")

report("OK", f"last PostgreSQL backup completed less than 25 hours ago: {last_backup}")
|
|
@ -58,11 +58,6 @@ env = os.environ.copy()
|
|||
env["WALG_UPLOAD_DISK_CONCURRENCY"] = disk_concurrency
|
||||
subprocess.check_call(["env-wal-g", "backup-push", pg_data_path], env=env)
|
||||
|
||||
now = datetime.now(tz=timezone.utc)
|
||||
with open("/var/lib/nagios_state/last_postgresql_backup", "w") as f:
|
||||
f.write(now.isoformat())
|
||||
f.write("\n")
|
||||
|
||||
backups = {}
|
||||
lines = subprocess.check_output(["env-wal-g", "backup-list"], text=True).split("\n")
|
||||
for line in lines[1:]:
|
||||
|
@ -70,7 +65,7 @@ for line in lines[1:]:
|
|||
backup_name, date_str = line.split()[0:2]
|
||||
backups[dateutil.parser.parse(date_str)] = backup_name
|
||||
|
||||
one_month_ago = now - timedelta(days=30)
|
||||
one_month_ago = datetime.now(tz=timezone.utc) - timedelta(days=30)
|
||||
for date in sorted(backups.keys(), reverse=True):
|
||||
if date < one_month_ago:
|
||||
# We pass `FIND_FULL` such that if delta backups are being
|
||||
|
|
|
@ -45,14 +45,4 @@ class zulip::postgresql_backups {
|
|||
mode => '0600',
|
||||
}
|
||||
}
|
||||
|
||||
# Manages the zulip_postgresql_backups directory under the Nagios plugins
# dir, populated recursively from the module's
# nagios_plugins/zulip_postgresql_backups files (root:root, mode 0755).
# recurse + purge are both enabled; requires the Nagios plugins package.
file { "${zulip::common::nagios_plugins_dir}/zulip_postgresql_backups":
|
||||
require => Package[$zulip::common::nagios_plugins],
|
||||
recurse => true,
|
||||
purge => true,
|
||||
owner => 'root',
|
||||
group => 'root',
|
||||
mode => '0755',
|
||||
source => 'puppet:///modules/zulip/nagios_plugins/zulip_postgresql_backups',
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue