mirror of https://github.com/zulip/zulip.git
prometheus: Add an exporter for wal-g backup properties.
Since backups may now be taken on arbitrary hosts, we need a blackbox monitor that _some_ backup was produced. Add a Prometheus exporter which calls `wal-g backup-list` and reports statistics about the backups. This could be extended to include `wal-g wal-verify`, but that requires a connection to the PostgreSQL server.
This commit is contained in:
parent
a22168d8b3
commit
3aba2789d3
|
@ -0,0 +1,153 @@
|
|||
#!/usr/bin/env python3
|
||||
import configparser
|
||||
import contextlib
|
||||
import json
|
||||
import logging
|
||||
import subprocess
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timedelta, timezone
|
||||
from http.server import BaseHTTPRequestHandler, HTTPServer
|
||||
from typing import Dict, List, Mapping, Optional, Protocol
|
||||
|
||||
|
||||
class GaugeMetric(Protocol):
    """Structural type of the setter returned by gauge registration.

    Calling it records `value` for the gauge, optionally qualified by a
    set of Prometheus labels.  Both parameters are positional-only.
    """

    def __call__(self, value: float, labels: Optional[Mapping[str, str]] = None, /) -> None:
        ...
|
||||
|
||||
|
||||
class WalGPrometheusServer(BaseHTTPRequestHandler):
    """Prometheus exporter for wal-g backup state.

    Serves `/metrics` by shelling out to `env-wal-g backup-list --detail
    --json` and reporting gauges (all prefixed with ``wal_g_backup_``)
    about the number, age, size, and duration of the backups found.
    """

    METRIC_PREFIX = "wal_g_backup_"

    # metric name -> preamble lines ("# TYPE ..." / "# HELP ...").
    # Shadowed by per-request instance attributes in do_GET.
    metrics: Dict[str, List[str]] = {}
    # metric name -> {rendered label string -> rendered sample line}
    metric_values: Dict[str, Dict[str, str]] = defaultdict(dict)

    server_version = "wal-g-prometheus-server/1.0"

    def gauge(
        self, name: str, description: Optional[str] = None, default_value: Optional[float] = None
    ) -> "GaugeMetric":
        """Register a gauge metric and return a setter for its values.

        Raises ValueError if `name` was already registered.  If
        `default_value` is not None, the unlabeled time series is
        initialized to it, so the metric is exported even if the setter
        is never called again (e.g. ``ok 0`` when listing backups fails).
        """
        if name in self.metrics:
            raise ValueError(f"Redefinition of {name} metric")
        self.metrics[name] = [f"# TYPE {self.METRIC_PREFIX}{name} gauge"]
        if description is not None:
            self.metrics[name].append(f"# HELP {self.METRIC_PREFIX}{name} {description}")

        def inner(value: float, labels: Optional[Mapping[str, str]] = None) -> None:
            # A later call with the same label set overwrites the earlier
            # sample, since the rendered label string is the dict key.
            label_str = ""
            if labels:
                label_str = "{" + ",".join([f'{k}="{v}"' for k, v in labels.items()]) + "}"
            self.metric_values[name][label_str] = f"{self.METRIC_PREFIX}{name}{label_str} {value}"

        if default_value is not None:
            inner(default_value)
        return inner

    def print_metrics(self) -> None:
        """Write all metrics with at least one value, in Prometheus text format."""
        lines = []
        for metric_name in self.metrics:
            if self.metric_values[metric_name]:
                # Print preamble
                lines += self.metrics[metric_name]
                for metric_value in self.metric_values[metric_name].values():
                    lines.append(metric_value)
                lines.append("")
        self.wfile.write("\n".join(lines).encode())

    def do_GET(self) -> None:
        """Serve a scrape of /metrics; any other path gets a 404."""
        if self.path != "/metrics":
            self.send_response(404)
            self.end_headers()
            sys.stderr.flush()
            return

        self.send_response(200)
        self.send_header("Content-type", "text/plain; version=0.0.4")
        self.end_headers()

        # Shadow the class attributes with fresh per-request instance
        # state, so each scrape starts from a clean slate.
        self.metrics = {}
        self.metric_values = defaultdict(dict)

        # "ok" defaults to 0, so a failure below is still reported.
        backup_ok = self.gauge("ok", "If wal-g backup-list was OK", 0)
        backup_count = self.gauge("count", "Number of backups")
        backup_earliest_age_seconds = self.gauge("earliest_age_seconds", "Age of the oldest backup")
        backup_latest_age_seconds = self.gauge(
            "latest_age_seconds", "Age of the most recent backup"
        )
        backup_latest_duration_seconds = self.gauge(
            "latest_duration_seconds", "Duration the most recent backup took, in seconds"
        )
        backup_latest_compressed_size_bytes = self.gauge(
            "latest_compressed_size_bytes", "Size of the most recent backup, in bytes"
        )
        backup_latest_uncompressed_size_bytes = self.gauge(
            "latest_uncompressed_size_bytes",
            "Uncompressed size of the most recent backup, in bytes",
        )
        backup_total_compressed_size_bytes = self.gauge(
            "total_compressed_size_bytes", "Total compressed size of all backups, in bytes"
        )

        now = datetime.now(tz=timezone.utc)
        try:
            config_file = configparser.RawConfigParser()
            config_file.read("/etc/zulip/zulip-secrets.conf")
            bucket = config_file["secrets"]["s3_backups_bucket"]

            backup_list_output = subprocess.check_output(
                ["env-wal-g", "backup-list", "--detail", "--json"],
                text=True,
            )
            data = json.loads(backup_list_output)
            backup_count(len(data), {"bucket": bucket})

            backup_total_compressed_size_bytes(
                sum(e["compressed_size"] for e in data), {"bucket": bucket}
            )

            if len(data) > 0:
                # Most recent backup first.  NOTE(review): assumes "time"
                # is a fixed-format timestamp string that sorts correctly
                # lexicographically -- confirm against wal-g's output.
                data.sort(key=lambda e: e["time"], reverse=True)
                latest = data[0]
                labels = {
                    "host": latest["hostname"],
                    "pg_version": str(latest["pg_version"]),
                    "bucket": bucket,
                }
                backup_latest_compressed_size_bytes(latest["compressed_size"], labels)
                backup_latest_uncompressed_size_bytes(latest["uncompressed_size"], labels)

                # Parse a timestamp field of a backup entry (defaulting to
                # the latest one) into a timezone-aware UTC datetime.  The
                # default argument deliberately binds `latest`.
                def t(key: str, e: Dict[str, str] = latest) -> datetime:
                    return datetime.strptime(e[key], e["date_fmt"]).replace(tzinfo=timezone.utc)

                backup_earliest_age_seconds(
                    (now - t("start_time", data[-1])) / timedelta(seconds=1),
                    {
                        "host": data[-1]["hostname"],
                        # str() for consistency with `labels` above, which
                        # stringifies pg_version.
                        "pg_version": str(data[-1]["pg_version"]),
                        "bucket": bucket,
                    },
                )
                backup_latest_age_seconds((now - t("start_time")) / timedelta(seconds=1), labels)
                backup_latest_duration_seconds(
                    (t("finish_time") - t("start_time")) / timedelta(seconds=1), labels
                )
            backup_ok(1)
        except Exception:
            # backup_ok keeps its default of 0; the traceback goes to the
            # exporter's log (stderr, captured by supervisor).
            logging.exception("Failed to collect wal-g backup metrics")
        finally:
            self.print_metrics()
            self.log_message(
                "Served in %.2f seconds",
                (datetime.now(tz=timezone.utc) - now) / timedelta(seconds=1),
            )
            sys.stderr.flush()
|
||||
|
||||
|
||||
# Script entry point: configure logging, then serve metrics until
# interrupted.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s: %(message)s")
# Bind to loopback only, so the exporter is reachable solely from the
# local host, on port 9188.
server = HTTPServer(("127.0.0.1", 9188), WalGPrometheusServer)
logging.info("Starting server...")
# Suppress the KeyboardInterrupt traceback on Ctrl-C so shutdown is clean.
with contextlib.suppress(KeyboardInterrupt):
    server.serve_forever()

server.server_close()
logging.info("Stopping server...")
|
|
@ -6,6 +6,9 @@ class zulip_ops::profile::prometheus_server {
|
|||
include zulip_ops::profile::base
|
||||
include zulip_ops::prometheus::base
|
||||
|
||||
# This blackbox monitoring of the backup system runs locally
|
||||
include zulip_ops::prometheus::wal_g
|
||||
|
||||
$version = $zulip::common::versions['prometheus']['version']
|
||||
$dir = "/srv/zulip-prometheus-${version}"
|
||||
$bin = "${dir}/prometheus"
|
||||
|
|
|
@ -0,0 +1,34 @@
|
|||
# @summary Prometheus monitoring of wal-g backups
#
class zulip_ops::prometheus::wal_g {
  include zulip_ops::prometheus::base
  include zulip::supervisor
  include zulip::wal_g

  # The exporter script itself; owned by the zulip user, which the
  # supervisor job below runs it as.
  file { '/usr/local/bin/wal-g-exporter':
    ensure  => file,
    require => User[zulip],
    owner   => 'zulip',
    group   => 'zulip',
    mode    => '0755',
    source  => 'puppet:///modules/zulip/postgresql/wal-g-exporter',
  }

  # We embed the hash of the contents into the name of the process, so
  # that `supervisorctl reread` knows that it has updated.
  $full_exporter_hash = sha256(file('zulip/postgresql/wal-g-exporter'))
  $exporter_hash = $full_exporter_hash[0,8]
  # Supervisor job definition; the template interpolates $exporter_hash
  # into the process name, so changing the script restarts the job.
  file { "${zulip::common::supervisor_conf_dir}/prometheus_wal_g_exporter.conf":
    ensure  => file,
    require => [
      User[zulip],
      Package[supervisor],
      File['/usr/local/bin/wal-g-exporter'],
    ],
    owner   => 'root',
    group   => 'root',
    mode    => '0644',
    content => template('zulip_ops/supervisor/conf.d/prometheus_wal_g_exporter.conf.template.erb'),
    notify  => Service[supervisor],
  }
}
|
|
@ -0,0 +1,12 @@
|
|||
# Supervisor job for the wal-g backup Prometheus exporter.
[program:prometheus_wal_g_exporter]
# We record the hash of the script in the process name so that changing
# the script changes this file, which makes `supervisorctl reread &&
# supervisorctl update` restart this job.
command=/usr/local/bin/wal-g-exporter
process_name=wal-g-exporter_<%= @exporter_hash %>
priority=10
autostart=true
autorestart=true
user=zulip
redirect_stderr=true
stdout_logfile=/var/log/zulip/wal_g_exporter.log
|
Loading…
Reference in New Issue