mirror of https://github.com/zulip/zulip.git
Add Nagios plugin for monitoring the Postgres replication lag
(imported from commit 0196cca357ba58b08ac74065ce84f0e49141d30d)
This commit is contained in:
parent
3286fed238
commit
14f2b41c5d
|
@ -155,3 +155,8 @@ define command {
|
|||
command_name check_fts_update_log
|
||||
command_line /usr/lib/nagios/plugins/check_by_ssh -l humbug -t 30 -i /var/lib/nagios/.ssh/id_rsa -H $HOSTADDRESS$ -C '/usr/lib/nagios/plugins/check_fts_update_log'
|
||||
}
|
||||
|
||||
# Nagios command that runs the check_pg_replication_lag plugin straight from
# the plugin directory, with no arguments and no check_by_ssh wrapper.
define command {
|
||||
command_name check_pg_replication_lag
|
||||
command_line /usr/lib/nagios/plugins/check_pg_replication_lag
|
||||
}
|
||||
|
|
|
@ -43,3 +43,10 @@ define service{
|
|||
# the second and third arguments, below.
|
||||
check_command check_named_procs!autossh!9:9!9:15
|
||||
}
|
||||
|
||||
# Service attached to the host named "nagios": inherits the generic-service
# template and invokes the check_pg_replication_lag command.
define service{
|
||||
use generic-service
|
||||
host_name nagios
|
||||
service_description Check postgres replication lag
|
||||
check_command check_pg_replication_lag
|
||||
}
|
||||
|
|
|
@ -0,0 +1,78 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
"""
|
||||
Nagios plugin to check the difference between the primary and
|
||||
secondary Postgres servers' xlog location.
|
||||
"""
|
||||
|
||||
import subprocess
|
||||
import re
|
||||
|
||||
# Process exit code for each state name that report() can print.
states = {
    "OK": 0,
    "WARNING": 1,
    "CRITICAL": 2,
    "UNKNOWN": 3,
}


def report(state, msg):
    """Print a one-line status and terminate with the matching exit code.

    state -- a key of ``states`` ("OK", "WARNING", "CRITICAL" or "UNKNOWN").
    msg   -- human-readable detail printed after the state name.

    Never returns: raises SystemExit carrying ``states[state]``.  An
    unrecognised state raises KeyError after the line has been printed.
    """
    # The parenthesised single-argument form prints the same text whether
    # it is parsed as a Python 2 print statement or a Python 3 function call.
    print("%s: %s" % (state, msg))
    # Raise SystemExit directly rather than calling exit(), which is only
    # an interactive helper injected by the site module.
    raise SystemExit(states[state])
|
||||
|
||||
def get_loc_over_ssh(host, func):
    """Run ``SELECT func()`` through psql on *host* and return the raw output.

    The query is executed by ssh-ing to *host* as the ``humbug`` user.  The
    text produced by ``psql -t`` is returned unparsed.
    subprocess.CalledProcessError propagates if the command exits non-zero.
    """
    remote_query = 'psql -t -c "SELECT %s()"' % (func,)
    ssh_argv = ['ssh', host, '-l', 'humbug', remote_query]
    return subprocess.check_output(ssh_argv)
|
||||
|
||||
def loc_to_abs_offset(loc_str):
    """Convert an xlog location string into an absolute byte offset.

    loc_str -- text of the form "<hex xlogid>/<hex offset>"; surrounding
               whitespace is tolerated.

    Returns the offset as an integer.  Raises ValueError when loc_str does
    not have the expected shape.
    """
    parsed = re.match(r'^\s*([0-9a-fA-F]+)/([0-9a-fA-F]+)\s*$', loc_str)
    if parsed is None:
        raise ValueError("Unknown xlog location format: " + loc_str)

    # Both halves of the location are hexadecimal numbers.
    xlog_id, record_offset = (int(part, 16) for part in parsed.groups())

    # PostgreSQL 9.2's pg_xlog_location_diff computes
    #     XLogFileSize * (xlogid1 - xlogid2) + xrecoff1 - xrecoff2
    # so with the second location taken as 0/0 the absolute offset is
    #     XLogFileSize * xlogid1 + xrecoff1
    #
    # xlog_internal.h defines
    #     XLogSegSize     = (uint32) XLOG_SEG_SIZE
    #     XLogSegsPerFile = ((uint32) 0xffffffff) / XLogSegSize
    #     XLogFileSize    = XLogSegsPerFile * XLogSegSize
    # which, for the usual 16MB XLOG_SEG_SIZE, makes XLogFileSize 0xFF000000.
    xlog_file_size = 0xFF000000
    return xlog_file_size * xlog_id + record_offset
|
||||
|
||||
# Fetch the locations in this order to make the differences positive
# in the normal case given the delay in getting the values via ssh
secondary_replay_loc = get_loc_over_ssh('postgres-secondary.zulip.net', 'pg_last_xlog_replay_location')
secondary_recv_loc = get_loc_over_ssh('postgres-secondary.zulip.net', 'pg_last_xlog_receive_location')
primary_loc = get_loc_over_ssh('postgres-primary.zulip.net', 'pg_current_xlog_location')

primary_offset = loc_to_abs_offset(primary_loc)
secondary_recv_offset = loc_to_abs_offset(secondary_recv_loc)
secondary_replay_offset = loc_to_abs_offset(secondary_replay_loc)

# Bytes the secondary has yet to receive from the primary, and bytes it has
# received but not yet replayed.
recv_diff = primary_offset - secondary_recv_offset
replay_diff = secondary_recv_offset - secondary_replay_offset

# xlog segments are normally 16MB each.  These thresholds are pretty arbitrary.
XLOG_SEGMENT_BYTES = 16 * 1024**2
WARNING_BYTES = XLOG_SEGMENT_BYTES
CRITICAL_BYTES = 5 * XLOG_SEGMENT_BYTES

# report() terminates the process, so the first matching check decides the
# result.  Every CRITICAL condition is therefore tested before any WARNING
# one; otherwise the lower WARNING threshold would always fire first and
# the CRITICAL branches could never be reached.
if recv_diff > CRITICAL_BYTES:
    report('CRITICAL', 'secondary is %d bytes behind on receiving xlog' % (recv_diff,))

if replay_diff > CRITICAL_BYTES:
    report('CRITICAL', 'secondary is %d bytes behind on applying received xlog' % (replay_diff,))

# A negative difference means the secondary appears to be ahead of where
# the data it depends on was when sampled, which should not happen given
# the fetch order above.
if recv_diff < 0:
    report('CRITICAL', 'secondary is %d bytes ahead on receiving xlog' % (recv_diff,))

if replay_diff < 0:
    report('CRITICAL', 'secondary is %d bytes ahead on applying received xlog' % (replay_diff,))

if recv_diff > WARNING_BYTES:
    report('WARNING', 'secondary is %d bytes behind on receiving xlog' % (recv_diff,))

if replay_diff > WARNING_BYTES:
    report('WARNING', 'secondary is %d bytes behind on applying received xlog' % (replay_diff,))

report('OK', ('secondary is %d bytes behind on receiving and %d bytes behind on applying xlog'
              % (recv_diff, replay_diff)))
|
Loading…
Reference in New Issue