mirror of https://github.com/zulip/zulip.git
Add nagios alert for Postgres backups
(imported from commit 1ffe019b898751aea215dda1826113c1df5bee5c)
This commit is contained in:
parent
2bb6f45af2
commit
7409e81775
|
@ -165,3 +165,8 @@ define command {
|
||||||
command_name check_pg_replication_lag
|
command_name check_pg_replication_lag
|
||||||
command_line /usr/lib/nagios/plugins/check_pg_replication_lag
|
command_line /usr/lib/nagios/plugins/check_pg_replication_lag
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Nagios command definition: runs the check_postgres_backup plugin on the
# monitored host ($HOSTADDRESS$) through check_by_ssh, logging in as "humbug"
# with the key at /var/lib/nagios/.ssh/id_rsa and a 30-second timeout.
define command {
|
||||||
|
command_name check_postgres_backup
|
||||||
|
command_line /usr/lib/nagios/plugins/check_by_ssh -l humbug -t 30 -i /var/lib/nagios/.ssh/id_rsa -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/check_postgres_backup'
|
||||||
|
}
|
||||||
|
|
|
@ -207,6 +207,14 @@ define service {
|
||||||
contact_groups admins
|
contact_groups admins
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Nagios service definition: applies the check_postgres_backup command to the
# "postgres" hostgroup, inheriting defaults from the generic-service template
# and using the "admins" contact group.
define service {
|
||||||
|
use generic-service
|
||||||
|
service_description Check last Postgres backup time
|
||||||
|
check_command check_postgres_backup
|
||||||
|
hostgroup postgres
|
||||||
|
contact_groups admins
|
||||||
|
}
|
||||||
|
|
||||||
define service {
|
define service {
|
||||||
use generic-service
|
use generic-service
|
||||||
service_description process_user_activity bot
|
service_description process_user_activity bot
|
||||||
|
|
|
@ -0,0 +1,32 @@
|
||||||
|
#!/usr/bin/python
|
||||||
|
|
||||||
|
import dateutil.parser
|
||||||
|
import pytz
|
||||||
|
import subprocess
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
import sys

# Nagios plugin exit codes, keyed by the state name printed in the status line.
states = {
    "OK": 0,
    "WARNING": 1,
    "CRITICAL": 2,
    "UNKNOWN": 3,
}

# File holding the ISO-8601 completion time of the last backup.
STATE_FILE = '/var/lib/nagios_state/last_postgres_backup'

# A backup older than this is reported as CRITICAL.
MAX_BACKUP_AGE = timedelta(hours=25)


def report(state, msg):
    """Print a "STATE: message" status line and exit with the matching code.

    state must be one of the keys of ``states``; this function never returns.
    """
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print("%s: %s" % (state, msg))
    # sys.exit rather than the site-provided exit(), which is not guaranteed
    # to exist in every interpreter configuration.
    sys.exit(states[state])


def is_primary():
    """Return True when the local Postgres server is not in recovery.

    Raises subprocess.CalledProcessError if psql exits non-zero, or OSError
    if psql cannot be executed at all.
    """
    output = subprocess.check_output(
        ['psql', '-h', 'localhost', 'humbug', 'humbug', '-t', '-c',
         'SELECT pg_is_in_recovery()'],
        # Ask for text so the comparison below also works on Python 3,
        # where check_output would otherwise return bytes.
        universal_newlines=True)
    return output.strip() == 'f'


def read_last_backup(path=STATE_FILE):
    """Return the backup completion time stored in ``path``.

    Raises IOError if the file cannot be read, and ValueError or
    OverflowError if its contents are not a usable timestamp.
    """
    with open(path, 'r') as f:
        last_backup = dateutil.parser.parse(f.read())
    if last_backup.tzinfo is None:
        # NOTE(review): the timestamp is presumably written in UTC with an
        # explicit offset; a value without one is treated as UTC so the
        # aware/naive subtraction in backup_status cannot raise TypeError.
        last_backup = last_backup.replace(tzinfo=pytz.utc)
    return last_backup


def backup_status(last_backup, now):
    """Return the (state, message) pair for a backup finished at last_backup.

    Both arguments must be datetimes of the same kind (both aware or both
    naive). The state is CRITICAL when the backup is strictly older than
    MAX_BACKUP_AGE and OK otherwise.
    """
    if now - last_backup > MAX_BACKUP_AGE:
        return ('CRITICAL',
                'last Postgres backup completed more than 25 hours ago: %s' % (last_backup,))
    return ('OK',
            'last Postgres backup completed less than 25 hours ago: %s' % (last_backup,))


def main():
    """Run the check and exit with the appropriate Nagios status."""
    try:
        primary = is_primary()
    except (subprocess.CalledProcessError, OSError) as e:
        # Without this, a psql failure would surface as a traceback and an
        # exit code of 1, which Nagios interprets as WARNING.
        report('UNKNOWN', 'could not query Postgres recovery status: %s' % (e,))
    if not primary:
        report('OK', 'this is not the primary')

    try:
        last_backup = read_last_backup()
    except (IOError, ValueError, OverflowError):
        # Covers both a missing/unreadable state file and unparsable contents.
        report('UNKNOWN', 'could not determine completion time of last Postgres backup')

    report(*backup_status(last_backup, datetime.now(tz=pytz.utc)))


if __name__ == '__main__':
    main()
|
|
@ -29,6 +29,11 @@ if run(['psql', '-t', '-c', 'select pg_is_in_recovery()']).strip() != 'f':
|
||||||
|
|
||||||
run(['env-wal-e', 'backup-push', '/var/lib/postgresql/9.1/main'])
|
run(['env-wal-e', 'backup-push', '/var/lib/postgresql/9.1/main'])
|
||||||
|
|
||||||
|
now = datetime.now(tz=pytz.utc)
|
||||||
|
with open('/var/lib/nagios_state/last_postgres_backup', 'w') as f:
|
||||||
|
f.write(now.isoformat())
|
||||||
|
f.write("\n")
|
||||||
|
|
||||||
backups = {}
|
backups = {}
|
||||||
lines = run(['env-wal-e', 'backup-list']).split("\n")
|
lines = run(['env-wal-e', 'backup-list']).split("\n")
|
||||||
for line in lines[1:]:
|
for line in lines[1:]:
|
||||||
|
@ -36,7 +41,7 @@ for line in lines[1:]:
|
||||||
backup_name, date, _, _ = line.split()
|
backup_name, date, _, _ = line.split()
|
||||||
backups[dateutil.parser.parse(date)] = backup_name
|
backups[dateutil.parser.parse(date)] = backup_name
|
||||||
|
|
||||||
one_month_ago = datetime.now(tz=pytz.utc) - timedelta(days=30)
|
one_month_ago = now - timedelta(days=30)
|
||||||
for date in sorted(backups.keys(), reverse=True):
|
for date in sorted(backups.keys(), reverse=True):
|
||||||
if date < one_month_ago:
|
if date < one_month_ago:
|
||||||
run(['env-wal-e', 'delete', '--confirm', 'before', backups[date]])
|
run(['env-wal-e', 'delete', '--confirm', 'before', backups[date]])
|
||||||
|
|
Loading…
Reference in New Issue