integration: Update topic & content of Grafana webhook.

Choosing the topic from the number of firing alerts makes no sense: it
scatters conversations and alerts across topics depending on how many
alerts happen to be firing at the time.

Send a separate message for each alert in the Grafana webhook payload,
with the alert's name as its topic; if no alert name can be found,
fall back to the alert's fingerprint.  Also include all alert values
in the body of the message, along with links to the alert generator,
silence, and image, if available.

Co-authored-by: Alex Vandiver <alexmv@zulip.com>
This commit is contained in:
Ryan Crisanti 2024-02-29 13:10:12 -05:00 committed by Tim Abbott
parent 18067794ae
commit fc2aac6baa
5 changed files with 285 additions and 46 deletions

View File

@ -758,7 +758,7 @@ DOC_SCREENSHOT_CONFIG: Dict[str, List[BaseScreenshotConfig]] = {
"gocd": [ScreenshotConfig("pipeline.json")],
"gogs": [ScreenshotConfig("pull_request__opened.json")],
"gosquared": [ScreenshotConfig("traffic_spike.json", image_name="000.png")],
"grafana": [ScreenshotConfig("alert_v7.json")],
"grafana": [ScreenshotConfig("alert_values_v11.json")],
"greenhouse": [ScreenshotConfig("candidate_stage_change.json", image_name="000.png")],
"groove": [ScreenshotConfig("ticket_started.json")],
"harbor": [ScreenshotConfig("scanning_completed.json")],

View File

@ -0,0 +1,51 @@
{
"receiver": "Debug webhook",
"status": "firing",
"alerts": [
{
"status": "firing",
"labels": {
"debug": "true",
"grafana_folder": "device"
},
"annotations": {
"summary": "High memory usage"
},
"startsAt": "2024-03-01T02:09:00Z",
"endsAt": "0001-01-01T00:00:00Z",
"generatorURL": "https://play.grafana.org/alerting/grafana/dd2f0260-3cfc-4c65-a4c4-f3f632c551f4/view?orgId=1",
"fingerprint": "e6349a25f5ef0e9e",
"silenceURL": "https://play.grafana.org/alerting/silence/new?alertmanager=grafana\u0026matcher=alertname%3DMemory+%28copy%29\u0026matcher=debug%3Dtrue\u0026matcher=grafana_folder%3Ddevice\u0026orgId=1",
"dashboardURL": "https://play.grafana.org/d/ece9fb32-7f71-4be9-bd94-2f23608ae5b9?orgId=1",
"panelURL": "https://play.grafana.org/d/ece9fb32-7f71-4be9-bd94-2f23608ae5b9?orgId=1\u0026viewPanel=2",
"values": {
"A": 2473545728,
"B": 0,
"C": 1,
"minute": 9
},
"valueString": "[ var='A' labels={instance=node_exporter:9100, job=node} value=2.473545728e+09 ], [ var='B' labels={instance=node_exporter:9100, job=node} value=0 ], [ var='C' labels={} value=1 ], [ var='minute' labels={} value=9 ]",
"imageURL": "https://grafana.com/assets/img/blog/mixed_styles.png"
}
],
"groupLabels": {
"alertname": "Memory (copy)",
"grafana_folder": "device"
},
"commonLabels": {
"alertname": "Memory (copy)",
"debug": "true",
"grafana_folder": "device"
},
"commonAnnotations": {
"summary": "High memory usage"
},
"externalURL": "https://play.grafana.org/",
"version": "1",
"groupKey": "{}/{debug=\"true\"}:{alertname=\"Memory (copy)\", grafana_folder=\"device\"}",
"truncatedAlerts": 0,
"orgId": 1,
"title": "[FIRING:1] Memory (copy) device (true)",
"state": "alerting",
"message": "**Firing**\n\nValue: A=2.473545728e+09, B=0, C=1, minute=9\nLabels:\n - alertname = Memory (copy)\n - debug = true\n - grafana_folder = device\nAnnotations:\n - summary = High memory usage\nSource: https://play.grafana.org/alerting/grafana/dd2f0260-3cfc-4c65-a4c4-f3f632c551f4/view?orgId=1\nSilence: https://play.grafana.org/alerting/silence/new?alertmanager=grafana\u0026matcher=alertname%3DMemory+%28copy%29\u0026matcher=debug%3Dtrue\u0026matcher=grafana_folder%3Ddevice\u0026orgId=1\nDashboard: https://play.grafana.org/d/ece9fb32-7f71-4be9-bd94-2f23608ae5b9?orgId=1\nPanel: https://play.grafana.org/d/ece9fb32-7f71-4be9-bd94-2f23608ae5b9?orgId=1\u0026viewPanel=2\n"
}

View File

@ -0,0 +1,52 @@
{
"receiver": "Debug webhook",
"status": "firing",
"alerts": [
{
"status": "firing",
"labels": {
"alertname": "Memory (copy)",
"debug": "true",
"grafana_folder": "device"
},
"annotations": {
"summary": "High memory usage"
},
"startsAt": "2024-03-01T02:09:00Z",
"endsAt": "0001-01-01T00:00:00Z",
"generatorURL": "https://play.grafana.org/alerting/grafana/dd2f0260-3cfc-4c65-a4c4-f3f632c551f4/view?orgId=1",
"fingerprint": "e6349a25f5ef0e9e",
"silenceURL": "https://play.grafana.org/alerting/silence/new?alertmanager=grafana\u0026matcher=alertname%3DMemory+%28copy%29\u0026matcher=debug%3Dtrue\u0026matcher=grafana_folder%3Ddevice\u0026orgId=1",
"dashboardURL": "https://play.grafana.org/d/ece9fb32-7f71-4be9-bd94-2f23608ae5b9?orgId=1",
"panelURL": "https://play.grafana.org/d/ece9fb32-7f71-4be9-bd94-2f23608ae5b9?orgId=1\u0026viewPanel=2",
"values": {
"A": 2473545728,
"B": 0,
"C": 1,
"minute": 9
},
"valueString": "[ var='A' labels={instance=node_exporter:9100, job=node} value=2.473545728e+09 ], [ var='B' labels={instance=node_exporter:9100, job=node} value=0 ], [ var='C' labels={} value=1 ], [ var='minute' labels={} value=9 ]",
"imageURL": "https://grafana.com/assets/img/blog/mixed_styles.png"
}
],
"groupLabels": {
"alertname": "Memory (copy)",
"grafana_folder": "device"
},
"commonLabels": {
"alertname": "Memory (copy)",
"debug": "true",
"grafana_folder": "device"
},
"commonAnnotations": {
"summary": "High memory usage"
},
"externalURL": "https://play.grafana.org/",
"version": "1",
"groupKey": "{}/{debug=\"true\"}:{alertname=\"Memory (copy)\", grafana_folder=\"device\"}",
"truncatedAlerts": 0,
"orgId": 1,
"title": "[FIRING:1] Memory (copy) device (true)",
"state": "alerting",
"message": "**Firing**\n\nValue: A=2.473545728e+09, B=0, C=1, minute=9\nLabels:\n - alertname = Memory (copy)\n - debug = true\n - grafana_folder = device\nAnnotations:\n - summary = High memory usage\nSource: https://play.grafana.org/alerting/grafana/dd2f0260-3cfc-4c65-a4c4-f3f632c551f4/view?orgId=1\nSilence: https://play.grafana.org/alerting/silence/new?alertmanager=grafana\u0026matcher=alertname%3DMemory+%28copy%29\u0026matcher=debug%3Dtrue\u0026matcher=grafana_folder%3Ddevice\u0026orgId=1\nDashboard: https://play.grafana.org/d/ece9fb32-7f71-4be9-bd94-2f23608ae5b9?orgId=1\nPanel: https://play.grafana.org/d/ece9fb32-7f71-4be9-bd94-2f23608ae5b9?orgId=1\u0026viewPanel=2\n"
}

View File

@ -128,14 +128,11 @@ Someone is testing the alert notification within grafana.
)
def test_alert_v8(self) -> None:
expected_topic_name = "[RESOLVED:1]"
expected_topic_name = "[TestAlert]"
expected_message = """
:checkbox: **RESOLVED**
Webhook test message.
---
**Alert 1**: TestAlert.
**TestAlert**
This alert was fired at <time:2022-08-31T05:54:04.52289368Z>.
@ -145,10 +142,13 @@ Labels:
- alertname: TestAlert
- instance: Grafana
Values:
[ metric='foo' labels={instance=bar} value=10 ]
Annotations:
- summary: Notification test
1 alert(s) truncated.
[Silence](https://zuliptestingwh2.grafana.net/alerting/silence/new?alertmanager=grafana&matcher=alertname%3DTestAlert&matcher=instance%3DGrafana)
""".strip()
self.check_webhook(
@ -159,44 +159,143 @@ Annotations:
)
def test_alert_multiple_v8(self) -> None:
expected_topic_name = "[FIRING:2]"
expected_message = """
expected_topic_name_1 = "[High memory usage]"
expected_topic_name_2 = "[High CPU usage]"
expected_message_1 = """
:alert: **FIRING**
Webhook test message.
---
**Alert 1**: High memory usage.
**High memory usage**
This alert was fired at <time:2021-10-12T09:51:03.157076+02:00>.
Labels:
- alertname: High memory usage
- team: blue
- zone: us-1
Values:
[ metric='' labels={} value=14151.331895396988 ]
Annotations:
- description: The system has high memory usage
- runbook_url: https://myrunbook.com/runbook/1234
- summary: This alert was triggered for zone us-1
[Generator](https://play.grafana.org/alerting/1afz29v7z/edit)
[Silence](https://play.grafana.org/alerting/silence/new?alertmanager=grafana&matchers=alertname%3DT2%2Cteam%3Dblue%2Czone%3Dus-1)
""".strip()
expected_message_2 = """
:alert: **FIRING**
---
**Alert 2**: High CPU usage.
**High CPU usage**
This alert was fired at <time:2021-10-12T09:56:03.157076+02:00>.
Labels:
- alertname: High CPU usage
- team: blue
- zone: eu-1
Values:
[ metric='' labels={} value=47043.702386305304 ]
Annotations:
- description: The system has high CPU usage
- runbook_url: https://myrunbook.com/runbook/1234
- summary: This alert was triggered for zone eu-1
[Generator](https://play.grafana.org/alerting/d1rdpdv7k/edit)
[Silence](https://play.grafana.org/alerting/silence/new?alertmanager=grafana&matchers=alertname%3DT1%2Cteam%3Dblue%2Czone%3Deu-1)
""".strip()
self.subscribe(self.test_user, self.CHANNEL_NAME)
payload = self.get_body("alert_multiple_v8")
msg = self.send_webhook_payload(
self.test_user,
self.url,
payload,
content_type="application/json",
)
msg = self.get_second_to_last_message()
self.assert_channel_message(
message=msg,
channel_name=self.CHANNEL_NAME,
topic_name=expected_topic_name_1,
content=expected_message_1,
)
msg = self.get_last_message()
self.assert_channel_message(
message=msg,
channel_name=self.CHANNEL_NAME,
topic_name=expected_topic_name_2,
content=expected_message_2,
)
def test_alert_values_v11(self) -> None:
expected_topic_name = "[Memory (copy)]" # alertname
expected_message = """
:alert: **FIRING**
**Memory (copy)**
This alert was fired at <time:2024-03-01T02:09:00Z>.
Labels:
- alertname: Memory (copy)
- debug: true
- grafana_folder: device
Values:
- A: 2473545728
- B: 0
- C: 1
- minute: 9
Annotations:
- summary: High memory usage
[Generator](https://play.grafana.org/alerting/grafana/dd2f0260-3cfc-4c65-a4c4-f3f632c551f4/view?orgId=1)
[Silence](https://play.grafana.org/alerting/silence/new?alertmanager=grafana\u0026matcher=alertname%3DMemory+%28copy%29\u0026matcher=debug%3Dtrue\u0026matcher=grafana_folder%3Ddevice\u0026orgId=1)
[Image](https://grafana.com/assets/img/blog/mixed_styles.png)
""".strip()
self.check_webhook(
"alert_multiple_v8",
"alert_values_v11",
expected_topic_name,
expected_message,
content_type="application/x-www-form-urlencoded",
)
def test_alert_no_alertname_v11(self) -> None:
expected_topic_name = "[e6349a25f5ef0e9e]" # fingerprint
expected_message = """
:alert: **FIRING**
This alert was fired at <time:2024-03-01T02:09:00Z>.
Labels:
- debug: true
- grafana_folder: device
Values:
- A: 2473545728
- B: 0
- C: 1
- minute: 9
Annotations:
- summary: High memory usage
[Generator](https://play.grafana.org/alerting/grafana/dd2f0260-3cfc-4c65-a4c4-f3f632c551f4/view?orgId=1)
[Silence](https://play.grafana.org/alerting/silence/new?alertmanager=grafana\u0026matcher=alertname%3DMemory+%28copy%29\u0026matcher=debug%3Dtrue\u0026matcher=grafana_folder%3Ddevice\u0026orgId=1)
[Image](https://grafana.com/assets/img/blog/mixed_styles.png)
""".strip()
self.check_webhook(
"alert_no_alertname_v11",
expected_topic_name,
expected_message,
content_type="application/x-www-form-urlencoded",

View File

@ -5,6 +5,7 @@ from zerver.lib.response import json_success
from zerver.lib.typed_endpoint import JsonBodyPayload, typed_endpoint
from zerver.lib.validator import (
WildValue,
check_anything,
check_float,
check_int,
check_none_or,
@ -21,20 +22,23 @@ ALERT_STATUS_TEMPLATE = "{alert_icon} **{alert_state}**\n\n"
OLD_MESSAGE_TEMPLATE = "{alert_status}[{rule_name}]({rule_url})\n\n{alert_message}{eval_matches}"
NEW_TOPIC_TEMPLATE = "[{alert_status}:{alert_count}]"
NEW_TOPIC_TEMPLATE = "[{alertname}]"
ALERT_HEADER_TEMPLATE = """\n---
**Alert {count}**"""
START_TIME_TEMPLATE = "This alert was fired at <time:{start_time}>."
START_TIME_TEMPLATE = "\n\nThis alert was fired at <time:{start_time}>.\n"
END_TIME_TEMPLATE = "\n\nThis alert was resolved at <time:{end_time}>."
END_TIME_TEMPLATE = "\nThis alert was resolved at <time:{end_time}>.\n\n"
MESSAGE_LABELS_TEMPLATE = "\n\nLabels:\n{label_information}\n"
MESSAGE_LABELS_TEMPLATE = "Labels:\n{label_information}\n"
MESSAGE_VALUES_TEMPLATE = "Values:\n{value_information}\n"
MESSAGE_ANNOTATIONS_TEMPLATE = "Annotations:\n{annotation_information}\n"
MESSAGE_ANNOTATIONS_TEMPLATE = "Annotations:\n{annotation_information}"
TRUNCATED_ALERTS_TEMPLATE = "{count} alert(s) truncated.\n"
MESSAGE_GENERATOR_TEMPLATE = "\n[Generator]({generator_url})"
MESSAGE_SILENCE_TEMPLATE = "\n[Silence]({silence_url})"
MESSAGE_IMAGE_TEMPLATE = "\n[Image]({image_url})"
LEGACY_EVENT_TYPES = ["ok", "pending", "alerting", "paused"]
@ -53,24 +57,31 @@ def api_grafana_webhook(
) -> HttpResponse:
# Grafana alerting system.
if "alerts" in payload:
status = payload["status"].tame(check_string_in(["firing", "resolved"]))
alert_count = len(payload["alerts"])
topic_name = NEW_TOPIC_TEMPLATE.format(alert_status=status.upper(), alert_count=alert_count)
if status == "firing":
body = ALERT_STATUS_TEMPLATE.format(alert_icon=":alert:", alert_state=status.upper())
else:
body = ALERT_STATUS_TEMPLATE.format(alert_icon=":checkbox:", alert_state=status.upper())
if payload["message"]:
body += payload["message"].tame(check_string) + "\n"
for index, alert in enumerate(payload["alerts"], 1):
body += ALERT_HEADER_TEMPLATE.format(count=index)
# Grafana 8.0 and above alerting; works for:
# - https://grafana.com/docs/grafana/v8.0/alerting/unified-alerting/message-templating/template-data/
# - https://grafana.com/docs/grafana/v9.0/alerting/contact-points/notifiers/webhook-notifier/
# - https://grafana.com/docs/grafana/v10.0/alerting/alerting-rules/manage-contact-points/webhook-notifier/
# - https://grafana.com/docs/grafana/v11.0/alerting/configure-notifications/manage-contact-points/integrations/webhook-notifier/
for alert in payload["alerts"]:
status = alert["status"].tame(check_string_in(["firing", "resolved"]))
if status == "firing":
body = ALERT_STATUS_TEMPLATE.format(
alert_icon=":alert:", alert_state=status.upper()
)
else:
body = ALERT_STATUS_TEMPLATE.format(
alert_icon=":checkbox:", alert_state=status.upper()
)
if "alertname" in alert["labels"] and alert["labels"]["alertname"]:
body += ": " + alert["labels"]["alertname"].tame(check_string) + "."
alertname = alert["labels"]["alertname"].tame(check_string)
topic_name = NEW_TOPIC_TEMPLATE.format(alertname=alertname)
body += "**" + alertname + "**\n\n"
else:
# If there is no alertname, fall back to the alert's fingerprint.
topic_name = NEW_TOPIC_TEMPLATE.format(
alertname=alert["fingerprint"].tame(check_string)
)
body += START_TIME_TEMPLATE.format(start_time=alert["startsAt"].tame(check_string))
@ -84,6 +95,19 @@ def api_grafana_webhook(
label_information += "- " + key + ": " + value.tame(check_string) + "\n"
body += MESSAGE_LABELS_TEMPLATE.format(label_information=label_information)
if alert.get("values"):
value_information = ""
for key, value in alert["values"].items():
value_information += "- " + key + ": " + str(value.tame(check_anything)) + "\n"
body += MESSAGE_VALUES_TEMPLATE.format(value_information=value_information)
elif alert.get("valueString"):
body += (
MESSAGE_VALUES_TEMPLATE.format(
value_information=alert["valueString"].tame(check_string)
)
+ "\n"
)
if alert["annotations"]:
annotation_information = ""
for key, value in alert["annotations"].items():
@ -92,17 +116,30 @@ def api_grafana_webhook(
annotation_information=annotation_information
)
if payload["truncatedAlerts"]:
body += TRUNCATED_ALERTS_TEMPLATE.format(
count=payload["truncatedAlerts"].tame(check_int)
)
if alert["generatorURL"]:
body += MESSAGE_GENERATOR_TEMPLATE.format(
generator_url=alert["generatorURL"].tame(check_string)
)
check_send_webhook_message(request, user_profile, topic_name, body, status)
if alert["silenceURL"]:
body += MESSAGE_SILENCE_TEMPLATE.format(
silence_url=alert["silenceURL"].tame(check_string)
)
if alert.get("imageURL"):
body += MESSAGE_IMAGE_TEMPLATE.format(
image_url=alert["imageURL"].tame(check_string)
)
body += "\n"
check_send_webhook_message(request, user_profile, topic_name, body, status)
return json_success(request)
# Legacy Grafana alerts.
else:
# Grafana 7.0 alerts:
# https://grafana.com/docs/grafana/v7.0/alerting/notifications/#webhook
topic_name = OLD_TOPIC_TEMPLATE.format(alert_title=payload["title"].tame(check_string))
eval_matches_text = ""