mirror of https://github.com/zulip/zulip.git
puppet: Remove check_postgresql_backup.
We have replaced this monitoring with the black-box wal-g monitoring, which is more accurate.
This commit is contained in:
parent
fef31614d3
commit
230040caa9
|
@ -245,10 +245,8 @@ To restore from a manual backup, the process is basically the reverse of the abo
|
||||||
This restoration process can also be used to migrate a Zulip
|
This restoration process can also be used to migrate a Zulip
|
||||||
installation from one server to another.
|
installation from one server to another.
|
||||||
|
|
||||||
We recommend running a disaster recovery test after setting up your
|
We recommend running a disaster recovery test after setting up your backups to
|
||||||
backups to confirm that your backups are working. You may also want to
|
confirm that your backups are working.
|
||||||
monitor that they are up to date using the Nagios plugin at:
|
|
||||||
`puppet/zulip/files/nagios_plugins/zulip_postgresql_backups/check_postgresql_backup`.
|
|
||||||
|
|
||||||
## Data export
|
## Data export
|
||||||
|
|
||||||
|
@ -528,9 +526,9 @@ it may be minutes before the backup is saved into S3 -- see
|
||||||
If you need always-current backup availability, Zulip also has
|
If you need always-current backup availability, Zulip also has
|
||||||
[built-in database replication support](postgresql.md#postgresql-warm-standby).
|
[built-in database replication support](postgresql.md#postgresql-warm-standby).
|
||||||
|
|
||||||
You can (and should) monitor that backups are running regularly via
|
You can (and should) monitor that backups are running regularly, for instance
|
||||||
the Nagios plugin installed into
|
via the Prometheus exporter found in
|
||||||
`/usr/lib/nagios/plugins/zulip_postgresql_backups/check_postgresql_backup`.
|
`puppet/zulip/files/postgresql/wal-g-exporter`.
|
||||||
|
|
||||||
### Streaming backups to S3
|
### Streaming backups to S3
|
||||||
|
|
||||||
|
|
|
@ -296,7 +296,6 @@ Database monitoring:
|
||||||
- `check_fts_update_log`: Checks whether full-text search updates are
|
- `check_fts_update_log`: Checks whether full-text search updates are
|
||||||
being processed properly or getting backlogged.
|
being processed properly or getting backlogged.
|
||||||
- `check_postgres`: General checks for database health.
|
- `check_postgres`: General checks for database health.
|
||||||
- `check_postgresql_backup`: Checks status of PostgreSQL backups.
|
|
||||||
- `check_postgresql_replication_lag`: Checks whether PostgreSQL streaming
|
- `check_postgresql_replication_lag`: Checks whether PostgreSQL streaming
|
||||||
replication is up to date.
|
replication is up to date.
|
||||||
|
|
||||||
|
|
|
@ -145,11 +145,6 @@ define command {
|
||||||
command_line /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_postgresql/check_postgresql_replication_lag'
|
command_line /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_postgresql/check_postgresql_replication_lag'
|
||||||
}
|
}
|
||||||
|
|
||||||
define command {
|
|
||||||
command_name check_postgresql_backup
|
|
||||||
command_line /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_postgresql_backups/check_postgresql_backup'
|
|
||||||
}
|
|
||||||
|
|
||||||
define command{
|
define command{
|
||||||
command_name check_worker_memory
|
command_name check_worker_memory
|
||||||
command_line /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_worker_memory'
|
command_line /usr/lib/nagios/plugins/check_by_ssh -l nagios -t 30 -i /var/lib/nagios/.ssh/id_ed25519 -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/zulip_app_frontend/check_worker_memory'
|
||||||
|
|
|
@ -273,14 +273,6 @@ define service {
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
define service {
|
|
||||||
use generic-service
|
|
||||||
service_description Check last PostgreSQL backup time
|
|
||||||
hostgroup_name postgresql
|
|
||||||
check_command check_postgresql_backup
|
|
||||||
contact_groups admins
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
#### Redis
|
#### Redis
|
||||||
|
|
||||||
|
|
|
@ -1,66 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
from datetime import datetime, timedelta, timezone
|
|
||||||
|
|
||||||
import dateutil.parser
|
|
||||||
|
|
||||||
# Exit codes for each status label this Nagios plugin can report.
states = dict(zip(("OK", "WARNING", "CRITICAL", "UNKNOWN"), range(4)))


def report(state: str, msg: str) -> None:
    """Print ``<state>: <msg>`` and exit with the code mapped to ``state``.

    ``state`` must be one of the keys of ``states``.  This function never
    returns normally: it always ends by raising ``SystemExit``.
    """
    print(f"{state}: {msg}")
    raise SystemExit(states[state])
|
|
||||||
|
|
||||||
|
|
||||||
# Ask the local PostgreSQL server how many streaming replicas are attached.
replicas = subprocess.check_output(
    [
        "psql",
        "-v",
        "ON_ERROR_STOP=1",
        "postgres",
        "-t",
        "-c",
        "SELECT COUNT(*) FROM pg_stat_replication",
    ],
    stdin=subprocess.DEVNULL,
    text=True,
).strip()
if int(replicas) > 0:
    # We are the primary and we have replicas; we expect that backups
    # will be taken on one of them.
    report("OK", "this is the primary, with backups taken on the replicas")

# check=False: a non-zero exit from crudini is not an error here; it simply
# means we fall through and treat backups as enabled.
skip_backups = subprocess.run(
    ["crudini", "--get", "/etc/zulip/zulip.conf", "postgresql", "skip_backups"],
    capture_output=True,
    text=True,
    check=False,
)
# stdout is text, so every accepted spelling must be a string; an integer 1
# in this list could never match and "skip_backups = 1" would be ignored.
if skip_backups.returncode == 0 and skip_backups.stdout.strip().lower() in [
    "1",
    "y",
    "t",
    "yes",
    "true",
    "enable",
    "enabled",
]:
    report("OK", "backups are disabled on this host")

try:
    with open("/var/lib/nagios_state/last_postgresql_backup") as f:
        last_backup = dateutil.parser.parse(f.read())
except (OSError, ValueError):
    # OSError: the state file is missing or unreadable.
    # ValueError: the file is empty or does not contain a parseable
    # timestamp.  Either way the last backup time is unknown, which should
    # be reported as UNKNOWN rather than crash with a traceback.
    report("UNKNOWN", "could not determine completion time of last PostgreSQL backup")

if datetime.now(tz=timezone.utc) - last_backup > timedelta(hours=25):
    report("CRITICAL", f"last PostgreSQL backup completed more than 25 hours ago: {last_backup}")

report("OK", f"last PostgreSQL backup completed less than 25 hours ago: {last_backup}")
|
|
|
@ -58,11 +58,6 @@ env = os.environ.copy()
|
||||||
env["WALG_UPLOAD_DISK_CONCURRENCY"] = disk_concurrency
|
env["WALG_UPLOAD_DISK_CONCURRENCY"] = disk_concurrency
|
||||||
subprocess.check_call(["env-wal-g", "backup-push", pg_data_path], env=env)
|
subprocess.check_call(["env-wal-g", "backup-push", pg_data_path], env=env)
|
||||||
|
|
||||||
now = datetime.now(tz=timezone.utc)
|
|
||||||
with open("/var/lib/nagios_state/last_postgresql_backup", "w") as f:
|
|
||||||
f.write(now.isoformat())
|
|
||||||
f.write("\n")
|
|
||||||
|
|
||||||
backups = {}
|
backups = {}
|
||||||
lines = subprocess.check_output(["env-wal-g", "backup-list"], text=True).split("\n")
|
lines = subprocess.check_output(["env-wal-g", "backup-list"], text=True).split("\n")
|
||||||
for line in lines[1:]:
|
for line in lines[1:]:
|
||||||
|
@ -70,7 +65,7 @@ for line in lines[1:]:
|
||||||
backup_name, date_str = line.split()[0:2]
|
backup_name, date_str = line.split()[0:2]
|
||||||
backups[dateutil.parser.parse(date_str)] = backup_name
|
backups[dateutil.parser.parse(date_str)] = backup_name
|
||||||
|
|
||||||
one_month_ago = now - timedelta(days=30)
|
one_month_ago = datetime.now(tz=timezone.utc) - timedelta(days=30)
|
||||||
for date in sorted(backups.keys(), reverse=True):
|
for date in sorted(backups.keys(), reverse=True):
|
||||||
if date < one_month_ago:
|
if date < one_month_ago:
|
||||||
# We pass `FIND_FULL` such that if delta backups are being
|
# We pass `FIND_FULL` such that if delta backups are being
|
||||||
|
|
|
@ -45,14 +45,4 @@ class zulip::postgresql_backups {
|
||||||
mode => '0600',
|
mode => '0600',
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
file { "${zulip::common::nagios_plugins_dir}/zulip_postgresql_backups":
|
|
||||||
require => Package[$zulip::common::nagios_plugins],
|
|
||||||
recurse => true,
|
|
||||||
purge => true,
|
|
||||||
owner => 'root',
|
|
||||||
group => 'root',
|
|
||||||
mode => '0755',
|
|
||||||
source => 'puppet:///modules/zulip/nagios_plugins/zulip_postgresql_backups',
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue