mirror of https://github.com/zulip/zulip.git
Add nagios alert for Postgres backups
(imported from commit 1ffe019b898751aea215dda1826113c1df5bee5c)
This commit is contained in:
parent
2bb6f45af2
commit
7409e81775
|
@ -165,3 +165,8 @@ define command {
|
|||
command_name check_pg_replication_lag
|
||||
command_line /usr/lib/nagios/plugins/check_pg_replication_lag
|
||||
}
|
||||
|
||||
# Command object: runs the check_postgres_backup plugin on the monitored host
# itself by going through check_by_ssh, logging in as "humbug" with the Nagios
# SSH key and a 30 second timeout.
define command {
        command_name    check_postgres_backup
        command_line    /usr/lib/nagios/plugins/check_by_ssh -l humbug -t 30 -i /var/lib/nagios/.ssh/id_rsa -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/check_postgres_backup'
}
|
||||
|
|
|
@ -207,6 +207,14 @@ define service {
|
|||
contact_groups admins
|
||||
}
|
||||
|
||||
# Service object: applies the check_postgres_backup command to every host in
# the "postgres" hostgroup, inheriting defaults from the generic-service
# template and notifying the "admins" contact group.
define service {
        use                     generic-service
        service_description     Check last Postgres backup time
        check_command           check_postgres_backup
        hostgroup               postgres
        contact_groups          admins
}
|
||||
|
||||
define service {
|
||||
use generic-service
|
||||
service_description process_user_activity bot
|
||||
|
|
|
@ -0,0 +1,32 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import dateutil.parser
|
||||
import pytz
|
||||
import subprocess
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
# Nagios plugin exit codes, keyed by the state name printed in the status line.
states = {
    "OK": 0,
    "WARNING": 1,
    "CRITICAL": 2,
    "UNKNOWN": 3,
}


def report(state, msg):
    """Print a Nagios status line and terminate with the matching exit code.

    state -- one of the keys of ``states``
    msg -- human-readable detail printed after the state name

    Does not return: raises SystemExit carrying ``states[state]``.  An
    unrecognized state raises KeyError before anything is printed.
    """
    # Imported locally so this block does not depend on the file's import list.
    import sys

    code = states[state]
    # The parenthesized single-argument form prints the same text whether it
    # is parsed as a Python 2 print statement or a Python 3 function call.
    print("%s: %s" % (state, msg))
    # sys.exit is always available; the bare ``exit`` name is injected by the
    # site module and is missing when the interpreter is started with -S.
    sys.exit(code)
|
||||
|
||||
# Ask the local Postgres whether it is in recovery.  Any failure to run psql
# is reported as UNKNOWN instead of escaping as a traceback, whose exit code 1
# Nagios would read as WARNING.
try:
    in_recovery = subprocess.check_output(
        ['psql', '-h', 'localhost', 'humbug', 'humbug', '-t', '-c',
         'SELECT pg_is_in_recovery()'],
        # Return text rather than bytes so the comparison with 'f' below is
        # also meaningful on Python 3.
        universal_newlines=True).strip()
except (OSError, subprocess.CalledProcessError) as e:
    report('UNKNOWN', 'could not query Postgres recovery state: %s' % (e,))

# Anything other than 'f' is treated as "not the primary" and reported OK
# without looking at the backup timestamp.
if in_recovery != 'f':
    report('OK', 'this is not the primary')

# The state file holds the ISO 8601 completion time of the last backup.
# A missing, unreadable or unparsable file all mean the time is unknown.
try:
    with open('/var/lib/nagios_state/last_postgres_backup', 'r') as f:
        last_backup = dateutil.parser.parse(f.read())
except (IOError, ValueError, OverflowError):
    report('UNKNOWN', 'could not determine completion time of last Postgres backup')

# Subtracting a naive datetime from an aware one raises TypeError, so a
# timestamp without a UTC offset cannot be aged reliably.
if last_backup.tzinfo is None:
    report('UNKNOWN', 'last Postgres backup timestamp has no timezone: %s' % (last_backup,))

backup_age = datetime.now(tz=pytz.utc) - last_backup
if backup_age > timedelta(hours=25):
    report('CRITICAL', 'last Postgres backup completed more than 25 hours ago: %s' % (last_backup,))

report('OK', 'last Postgres backup completed less than 25 hours ago: %s' % (last_backup,))
|
|
@ -29,6 +29,11 @@ if run(['psql', '-t', '-c', 'select pg_is_in_recovery()']).strip() != 'f':
|
|||
|
||||
run(['env-wal-e', 'backup-push', '/var/lib/postgresql/9.1/main'])
|
||||
|
||||
now = datetime.now(tz=pytz.utc)
|
||||
with open('/var/lib/nagios_state/last_postgres_backup', 'w') as f:
|
||||
f.write(now.isoformat())
|
||||
f.write("\n")
|
||||
|
||||
backups = {}
|
||||
lines = run(['env-wal-e', 'backup-list']).split("\n")
|
||||
for line in lines[1:]:
|
||||
|
@ -36,7 +41,7 @@ for line in lines[1:]:
|
|||
backup_name, date, _, _ = line.split()
|
||||
backups[dateutil.parser.parse(date)] = backup_name
|
||||
|
||||
one_month_ago = datetime.now(tz=pytz.utc) - timedelta(days=30)
|
||||
one_month_ago = now - timedelta(days=30)
|
||||
for date in sorted(backups.keys(), reverse=True):
|
||||
if date < one_month_ago:
|
||||
run(['env-wal-e', 'delete', '--confirm', 'before', backups[date]])
|
||||
|
|
Loading…
Reference in New Issue