mirror of https://github.com/zulip/zulip.git
prometheus: Add an exporter for wal-g backup properties.
Since backups may now be taken on arbitrary hosts, we need a blackbox monitor that verifies _some_ backup was produced. Add a Prometheus exporter which calls `wal-g backup-list` and reports statistics about the backups. This could be extended to include `wal-g wal-verify`, but that requires a connection to the PostgreSQL server.
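For orientation, a successful scrape of the exporter added below produces Prometheus text-format output along these lines (an illustrative excerpt; the bucket, host, and values here are made up, not real data):

# TYPE wal_g_backup_ok gauge
# HELP wal_g_backup_ok If wal-g backup-list was OK
wal_g_backup_ok 1
# TYPE wal_g_backup_count gauge
# HELP wal_g_backup_count Number of backups
wal_g_backup_count{bucket="example-bucket"} 14
# TYPE wal_g_backup_latest_age_seconds gauge
# HELP wal_g_backup_latest_age_seconds Age of the most recent backup
wal_g_backup_latest_age_seconds{host="postgres0",pg_version="140000",bucket="example-bucket"} 9873.5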
This commit is contained in:
parent a22168d8b3
commit 3aba2789d3
@ -0,0 +1,153 @@
#!/usr/bin/env python3
import configparser
import contextlib
import json
import logging
import subprocess
import sys
from collections import defaultdict
from datetime import datetime, timedelta, timezone
from http.server import BaseHTTPRequestHandler, HTTPServer
from typing import Dict, List, Mapping, Optional, Protocol


class GaugeMetric(Protocol):
    def __call__(self, value: float, labels: Optional[Mapping[str, str]] = None, /) -> None:
        pass


class WalGPrometheusServer(BaseHTTPRequestHandler):
    METRIC_PREFIX = "wal_g_backup_"

    metrics: Dict[str, List[str]] = {}
    metric_values: Dict[str, Dict[str, str]] = defaultdict(dict)

    server_version = "wal-g-prometheus-server/1.0"

    def gauge(
        self, name: str, description: Optional[str] = None, default_value: Optional[float] = None
    ) -> GaugeMetric:
        if name in self.metrics:
            raise ValueError(f"Redefinition of {name} metric")
        self.metrics[name] = [f"# TYPE {self.METRIC_PREFIX}{name} gauge"]
        if description is not None:
            self.metrics[name].append(f"# HELP {self.METRIC_PREFIX}{name} {description}")

        def inner(value: float, labels: Optional[Mapping[str, str]] = None) -> None:
            label_str = ""
            if labels:
                label_str = "{" + ",".join([f'{k}="{v}"' for k, v in labels.items()]) + "}"
            self.metric_values[name][label_str] = f"{self.METRIC_PREFIX}{name}{label_str} {value}"

        if default_value is not None:
            inner(default_value)
        return inner

    def print_metrics(self) -> None:
        lines = []
        for metric_name in self.metrics:
            if self.metric_values[metric_name]:
                # Print preamble
                lines += self.metrics[metric_name]
                for metric_value in self.metric_values[metric_name].values():
                    lines.append(metric_value)
                lines.append("")
        self.wfile.write("\n".join(lines).encode())

    def do_GET(self) -> None:
        if self.path != "/metrics":
            self.send_response(404)
            self.end_headers()
            sys.stderr.flush()
            return

        self.send_response(200)
        self.send_header("Content-type", "text/plain; version=0.0.4")
        self.end_headers()

        self.metrics = {}
        self.metric_values = defaultdict(dict)

        backup_ok = self.gauge("ok", "If wal-g backup-list was OK", 0)
        backup_count = self.gauge("count", "Number of backups")
        backup_earliest_age_seconds = self.gauge("earliest_age_seconds", "Age of the oldest backup")
        backup_latest_age_seconds = self.gauge(
            "latest_age_seconds", "Age of the most recent backup"
        )
        backup_latest_duration_seconds = self.gauge(
            "latest_duration_seconds", "Duration the most recent backup took, in seconds"
        )
        backup_latest_compressed_size_bytes = self.gauge(
            "latest_compressed_size_bytes", "Size of the most recent backup, in bytes"
        )
        backup_latest_uncompressed_size_bytes = self.gauge(
            "latest_uncompressed_size_bytes",
            "Uncompressed size of the most recent backup, in bytes",
        )
        backup_total_compressed_size_bytes = self.gauge(
            "total_compressed_size_bytes", "Total compressed size of all backups, in bytes"
        )

        now = datetime.now(tz=timezone.utc)
        try:
            config_file = configparser.RawConfigParser()
            config_file.read("/etc/zulip/zulip-secrets.conf")
            bucket = config_file["secrets"]["s3_backups_bucket"]

            backup_list_output = subprocess.check_output(
                ["env-wal-g", "backup-list", "--detail", "--json"],
                text=True,
            )
            data = json.loads(backup_list_output)
            backup_count(len(data), {"bucket": bucket})

            backup_total_compressed_size_bytes(
                sum(e["compressed_size"] for e in data), {"bucket": bucket}
            )

            if len(data) > 0:
                data.sort(key=lambda e: e["time"], reverse=True)
                latest = data[0]
                labels = {
                    "host": latest["hostname"],
                    "pg_version": str(latest["pg_version"]),
                    "bucket": bucket,
                }
                backup_latest_compressed_size_bytes(latest["compressed_size"], labels)
                backup_latest_uncompressed_size_bytes(latest["uncompressed_size"], labels)

                def t(key: str, e: Dict[str, str] = latest) -> datetime:
                    return datetime.strptime(e[key], e["date_fmt"]).replace(tzinfo=timezone.utc)

                backup_earliest_age_seconds(
                    (now - t("start_time", data[-1])) / timedelta(seconds=1),
                    {
                        "host": data[-1]["hostname"],
                        "pg_version": data[-1]["pg_version"],
                        "bucket": bucket,
                    },
                )
                backup_latest_age_seconds((now - t("start_time")) / timedelta(seconds=1), labels)
                backup_latest_duration_seconds(
                    (t("finish_time") - t("start_time")) / timedelta(seconds=1), labels
                )
            backup_ok(1)
        except Exception as e:
            logging.exception(e)
        finally:
            self.print_metrics()
            self.log_message(
                "Served in %.2f seconds",
                (datetime.now(tz=timezone.utc) - now) / timedelta(seconds=1),
            )
            sys.stderr.flush()


logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
server = HTTPServer(("127.0.0.1", 9188), WalGPrometheusServer)
logging.info("Starting server...")
with contextlib.suppress(KeyboardInterrupt):
    server.serve_forever()

server.server_close()
logging.info("Stopping server...")
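The parser above relies on the JSON shape of `wal-g backup-list --detail --json`. As a sketch of what it consumes (values invented, entries trimmed to the fields the exporter actually reads), the command returns a JSON array like:

[
  {
    "backup_name": "base_000000010000000000000002",
    "time": "2022-02-01T04:10:17.000Z",
    "start_time": "2022-02-01T04:00:07.000Z",
    "finish_time": "2022-02-01T04:10:17.000Z",
    "date_fmt": "%Y-%m-%dT%H:%M:%S.%fZ",
    "hostname": "postgres0",
    "pg_version": 140000,
    "compressed_size": 219721102,
    "uncompressed_size": 1052093102
  }
]

The `date_fmt` field is what lets the `t()` helper parse `start_time` and `finish_time` via `datetime.strptime` without hard-coding a timestamp format, and the sort on `time` is how the newest and oldest backups are picked out.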
@ -6,6 +6,9 @@ class zulip_ops::profile::prometheus_server {
  include zulip_ops::profile::base
  include zulip_ops::prometheus::base

  # This blackbox monitoring of the backup system runs locally
  include zulip_ops::prometheus::wal_g

  $version = $zulip::common::versions['prometheus']['version']
  $dir = "/srv/zulip-prometheus-${version}"
  $bin = "${dir}/prometheus"
@ -0,0 +1,34 @@
# @summary Prometheus monitoring of wal-g backups
#
class zulip_ops::prometheus::wal_g {
  include zulip_ops::prometheus::base
  include zulip::supervisor
  include zulip::wal_g

  file { '/usr/local/bin/wal-g-exporter':
    ensure  => file,
    require => User[zulip],
    owner   => 'zulip',
    group   => 'zulip',
    mode    => '0755',
    source  => 'puppet:///modules/zulip/postgresql/wal-g-exporter',
  }

  # We embed the hash of the contents into the name of the process, so
  # that `supervisorctl reread` knows that it has updated.
  $full_exporter_hash = sha256(file('zulip/postgresql/wal-g-exporter'))
  $exporter_hash = $full_exporter_hash[0,8]
  file { "${zulip::common::supervisor_conf_dir}/prometheus_wal_g_exporter.conf":
    ensure  => file,
    require => [
      User[zulip],
      Package[supervisor],
      File['/usr/local/bin/wal-g-exporter'],
    ],
    owner   => 'root',
    group   => 'root',
    mode    => '0644',
    content => template('zulip_ops/supervisor/conf.d/prometheus_wal_g_exporter.conf.template.erb'),
    notify  => Service[supervisor],
  }
}
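The `sha256(file(...))[0,8]` expression above truncates the script's content hash to eight hex characters. A minimal Python sketch of the same derivation, to show what ends up in the supervisor process name (the path is the deployed location from the file resource above; the printed suffix is illustrative):

import hashlib

# Hash the deployed exporter script, as Puppet does at catalog-compile time,
# and keep the first 8 hex digits for the process-name suffix.
with open("/usr/local/bin/wal-g-exporter", "rb") as f:
    exporter_hash = hashlib.sha256(f.read()).hexdigest()[:8]
print("wal-g-exporter_" + exporter_hash)  # e.g. wal-g-exporter_3f9c2a1b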
@ -0,0 +1,12 @@
[program:prometheus_wal_g_exporter]
# We record the hash of the script so that we can update this file
# with it, which will make `supervisorctl reread && supervisorctl
# update` restart this job.
command=/usr/local/bin/wal-g-exporter
process_name=wal-g-exporter_<%= @exporter_hash %>
priority=10
autostart=true
autorestart=true
user=zulip
redirect_stderr=true
stdout_logfile=/var/log/zulip/wal_g_exporter.log