Add nagios alert for Postgres backups

(imported from commit 1ffe019b898751aea215dda1826113c1df5bee5c)
This commit is contained in:
Zev Benjamin 2013-08-26 16:52:19 -04:00
parent 2bb6f45af2
commit 7409e81775
4 changed files with 51 additions and 1 deletions

View File

@ -165,3 +165,8 @@ define command {
command_name check_pg_replication_lag command_name check_pg_replication_lag
command_line /usr/lib/nagios/plugins/check_pg_replication_lag command_line /usr/lib/nagios/plugins/check_pg_replication_lag
} }
# Command used by the Postgres backup freshness service.
# It does not run the check on the Nagios server; instead it uses check_by_ssh
# to log in to the monitored host ($HOSTADDRESS$) as user "humbug" with the
# Nagios SSH key and runs the check_postgres_backup plugin there.
# -t 30 sets check_by_ssh's timeout to 30 seconds.
define command {
command_name check_postgres_backup
command_line /usr/lib/nagios/plugins/check_by_ssh -l humbug -t 30 -i /var/lib/nagios/.ssh/id_rsa -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/check_postgres_backup'
}

View File

@ -207,6 +207,14 @@ define service {
contact_groups admins contact_groups admins
} }
# Runs the check_postgres_backup command against every host in the
# "postgres" hostgroup and notifies the "admins" contact group.
# All other settings are inherited from the generic-service template.
define service {
use generic-service
service_description Check last Postgres backup time
check_command check_postgres_backup
hostgroup postgres
contact_groups admins
}
define service { define service {
use generic-service use generic-service
service_description process_user_activity bot service_description process_user_activity bot

View File

@ -0,0 +1,32 @@
#!/usr/bin/python
"""Nagios plugin: alert when the last Postgres backup is too old.

The backup job writes the ISO-8601 completion time of the most recent
backup to STATE_FILE.  This plugin reads that timestamp and reports:

  OK        the host is not the primary (pg_is_in_recovery() is not 'f'),
            or the last backup completed within MAX_BACKUP_AGE
  CRITICAL  the last backup completed more than MAX_BACKUP_AGE ago
  UNKNOWN   the primary/replica status or the backup time could not be
            determined

The result is printed as "<STATE>: <message>" and the process exits with
the matching Nagios status code.
"""
import subprocess
import sys
from datetime import datetime, timedelta

import dateutil.parser
import pytz

# Nagios plugin exit codes, keyed by state name.
states = {
    "OK": 0,
    "WARNING": 1,
    "CRITICAL": 2,
    "UNKNOWN": 3
}

# File holding the completion time of the last backup.
STATE_FILE = '/var/lib/nagios_state/last_postgres_backup'

# Backups older than this are reported as CRITICAL.
MAX_BACKUP_AGE = timedelta(hours=25)


def report(state, msg):
    """Print "<state>: <msg>" and exit with the Nagios code for `state`.

    `state` must be a key of `states`.  This function does not return.
    """
    # Single-argument call form behaves the same on Python 2 and 3.
    print("%s: %s" % (state, msg))
    # sys.exit rather than the `exit` builtin, which is only injected by the
    # `site` module and is intended for interactive use.
    sys.exit(states[state])


# Only the primary takes backups, so the check is meaningless on replicas.
try:
    # universal_newlines=True makes check_output return text on Python 3 as
    # well, so the comparison against 'f' below stays valid there.
    in_recovery = subprocess.check_output(
        ['psql', '-h', 'localhost', 'humbug', 'humbug', '-t', '-c',
         'SELECT pg_is_in_recovery()'],
        universal_newlines=True).strip()
except (OSError, subprocess.CalledProcessError) as e:
    # Without this an uncaught exception would exit with status 1, which
    # Nagios interprets as WARNING rather than UNKNOWN.
    report('UNKNOWN', 'could not determine whether this is the primary: %s' % (e,))

if in_recovery != 'f':
    report('OK', 'this is not the primary')

try:
    with open(STATE_FILE, 'r') as f:
        last_backup = dateutil.parser.parse(f.read())
except (IOError, ValueError):
    # IOError: the state file is missing or unreadable.
    # ValueError: the file content is not a parsable timestamp (for example
    # it is empty or truncated).
    report('UNKNOWN', 'could not determine completion time of last Postgres backup')

if last_backup.tzinfo is None:
    # Subtracting a naive datetime from an aware one raises TypeError.  The
    # time zone of a naive value is unknown, so report it instead of guessing.
    report('UNKNOWN', 'could not determine completion time of last Postgres backup')

if datetime.now(tz=pytz.utc) - last_backup > MAX_BACKUP_AGE:
    report('CRITICAL', 'last Postgres backup completed more than 25 hours ago: %s' % (last_backup,))

report('OK', 'last Postgres backup completed less than 25 hours ago: %s' % (last_backup,))

View File

@ -29,6 +29,11 @@ if run(['psql', '-t', '-c', 'select pg_is_in_recovery()']).strip() != 'f':
run(['env-wal-e', 'backup-push', '/var/lib/postgresql/9.1/main']) run(['env-wal-e', 'backup-push', '/var/lib/postgresql/9.1/main'])
now = datetime.now(tz=pytz.utc)
with open('/var/lib/nagios_state/last_postgres_backup', 'w') as f:
f.write(now.isoformat())
f.write("\n")
backups = {} backups = {}
lines = run(['env-wal-e', 'backup-list']).split("\n") lines = run(['env-wal-e', 'backup-list']).split("\n")
for line in lines[1:]: for line in lines[1:]:
@ -36,7 +41,7 @@ for line in lines[1:]:
backup_name, date, _, _ = line.split() backup_name, date, _, _ = line.split()
backups[dateutil.parser.parse(date)] = backup_name backups[dateutil.parser.parse(date)] = backup_name
one_month_ago = datetime.now(tz=pytz.utc) - timedelta(days=30) one_month_ago = now - timedelta(days=30)
for date in sorted(backups.keys(), reverse=True): for date in sorted(backups.keys(), reverse=True):
if date < one_month_ago: if date < one_month_ago:
run(['env-wal-e', 'delete', '--confirm', 'before', backups[date]]) run(['env-wal-e', 'delete', '--confirm', 'before', backups[date]])