Add Nagios plugin for monitoring the Postgres replication lag

(imported from commit 0196cca357ba58b08ac74065ce84f0e49141d30d)
This commit is contained in:
Zev Benjamin 2013-08-22 17:01:12 -04:00
parent 3286fed238
commit 14f2b41c5d
3 changed files with 90 additions and 0 deletions

View File

@ -155,3 +155,8 @@ define command {
command_name check_fts_update_log
command_line /usr/lib/nagios/plugins/check_by_ssh -l humbug -t 30 -i /var/lib/nagios/.ssh/id_rsa -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/check_fts_update_log'
}
# Command wrapper for the Postgres replication lag plugin.  The plugin is
# invoked directly, with no arguments and without check_by_ssh.
define command {
command_name check_pg_replication_lag
command_line /usr/lib/nagios/plugins/check_pg_replication_lag
}

View File

@ -43,3 +43,10 @@ define service{
# the second and third arguments, below.
check_command check_named_procs!autossh!9:9!9:15
}
# Postgres replication lag check, attached to the "nagios" host and run
# through the check_pg_replication_lag command with no arguments.
define service{
use generic-service
host_name nagios
service_description Check postgres replication lag
check_command check_pg_replication_lag
}

View File

@ -0,0 +1,78 @@
#!/usr/bin/python
"""
Nagios plugin to check the difference between the primary and
secondary Postgres servers' xlog location.
"""
import subprocess
import re
# Process exit status used for each Nagios state name.
states = {
    "OK": 0,
    "WARNING": 1,
    "CRITICAL": 2,
    "UNKNOWN": 3,
}


def report(state, msg):
    """Print a one-line Nagios status and terminate the plugin.

    state must be one of the keys of ``states``; msg is free-form text.
    The line "<state>: <msg>" is written to stdout and the process then
    exits with the status code mapped to that state.  This function does
    not return; an unknown state raises KeyError after the line is printed.
    """
    # Imported locally so this block does not rely on a file-level import.
    import sys

    # The parenthesised single-argument form produces the same output
    # whether print is a statement (Python 2) or a function (Python 3).
    print("%s: %s" % (state, msg))
    # sys.exit is always available, whereas the bare exit() builtin is
    # installed by the site module and intended for interactive sessions.
    sys.exit(states[state])
def get_loc_over_ssh(host, func):
    """Return psql's output for ``SELECT <func>()`` executed on *host*.

    The query is run by ssh-ing to *host* as the ``humbug`` user and
    invoking psql there.  The captured stdout is returned unmodified, so
    callers must cope with surrounding whitespace.  A non-zero exit status
    from ssh raises subprocess.CalledProcessError.
    """
    # func is interpolated verbatim into the remote shell command, so it
    # must be a trusted SQL function name.
    remote_command = 'psql -t -c "SELECT %s()"' % (func,)
    ssh_argv = ['ssh', host, '-l', 'humbug', remote_command]
    return subprocess.check_output(ssh_argv)
def loc_to_abs_offset(loc_str, xlog_file_size=0xFF000000):
    """Convert an xlog location string into an absolute byte offset.

    loc_str is a location of the form "<xlogid>/<xrecoff>", both parts in
    hexadecimal; leading and trailing whitespace is ignored.

    xlog_file_size is the number of bytes covered by one step of xlogid.
    The default corresponds to the usual 16MB XLOG_SEG_SIZE (see below);
    pass another value for a server built with a different segment size.

    Returns the offset as an integer.  Raises ValueError when loc_str does
    not match the expected format.
    """
    m = re.match(r'^\s*([0-9a-fA-F]+)/([0-9a-fA-F]+)\s*$', loc_str)
    if m is None:
        raise ValueError("Unknown xlog location format: " + loc_str)

    xlog_id = int(m.group(1), 16)
    rec_offset = int(m.group(2), 16)

    # From PostgreSQL 9.2's pg_xlog_location_diff:
    #   result = XLogFileSize * (xlogid1 - xlogid2) + xrecoff1 - xrecoff2
    # Taking xlogid2 and xrecoff2 to be zero to get the absolute offset:
    #   result = XLogFileSize * xlogid1 + xrecoff1
    #
    # xlog_internal.h says:
    #   #define XLogSegSize ((uint32) XLOG_SEG_SIZE)
    #   #define XLogSegsPerFile (((uint32) 0xffffffff) / XLogSegSize)
    #   #define XLogFileSize (XLogSegsPerFile * XLogSegSize)
    #
    # Since XLOG_SEG_SIZE is normally 16MB, XLogFileSize comes out to
    # 0xFF000000, which is the default for xlog_file_size.
    return xlog_file_size * xlog_id + rec_offset
# xlog segments are normally 16MB each.  These thresholds are pretty arbitrary.
XLOG_SEGMENT_BYTES = 16 * 1024**2
WARNING_LAG_BYTES = XLOG_SEGMENT_BYTES
CRITICAL_LAG_BYTES = 5 * XLOG_SEGMENT_BYTES

try:
    # Fetch the locations in this order to make the differences positive
    # in the normal case given the delay in getting the values via ssh
    secondary_replay_loc = get_loc_over_ssh('postgres-secondary.zulip.net', 'pg_last_xlog_replay_location')
    secondary_recv_loc = get_loc_over_ssh('postgres-secondary.zulip.net', 'pg_last_xlog_receive_location')
    primary_loc = get_loc_over_ssh('postgres-primary.zulip.net', 'pg_current_xlog_location')

    primary_offset = loc_to_abs_offset(primary_loc)
    secondary_recv_offset = loc_to_abs_offset(secondary_recv_loc)
    secondary_replay_offset = loc_to_abs_offset(secondary_replay_loc)
except (subprocess.CalledProcessError, OSError, ValueError) as e:
    # Without this, a failed ssh/psql call or unparsable output would end
    # the process with a traceback and exit status 1, which is the WARNING
    # status rather than UNKNOWN.
    report('UNKNOWN', 'could not determine xlog locations: %s' % (e,))

# How far the secondary's received xlog trails the primary, and how far
# its replayed xlog trails what it has received.
recv_diff = primary_offset - secondary_recv_offset
replay_diff = secondary_recv_offset - secondary_replay_offset

# report() exits on the first match, so every CRITICAL condition has to be
# tested before any WARNING condition; otherwise the lower WARNING
# threshold always wins and CRITICAL is never reported.
if recv_diff < 0:
    # Negate so the message shows a positive number of bytes ahead.
    report('CRITICAL', 'secondary is %d bytes ahead on receiving xlog' % (-recv_diff,))
if replay_diff < 0:
    report('CRITICAL', 'secondary is %d bytes ahead on applying received xlog' % (-replay_diff,))
if recv_diff > CRITICAL_LAG_BYTES:
    report('CRITICAL', 'secondary is %d bytes behind on receiving xlog' % (recv_diff,))
if replay_diff > CRITICAL_LAG_BYTES:
    report('CRITICAL', 'secondary is %d bytes behind on applying received xlog' % (replay_diff,))

if recv_diff > WARNING_LAG_BYTES:
    report('WARNING', 'secondary is %d bytes behind on receiving xlog' % (recv_diff,))
if replay_diff > WARNING_LAG_BYTES:
    report('WARNING', 'secondary is %d bytes behind on applying received xlog' % (replay_diff,))

report('OK', ('secondary is %d bytes behind on receiving and %d bytes behind on applying xlog'
              % (recv_diff, replay_diff)))