Add ntfy alert delivery for monitoring

This commit is contained in:
2026-05-17 11:34:19 +02:00
parent c748236886
commit db7dc3f2af
7 changed files with 202 additions and 2 deletions
+15
View File
@@ -6,6 +6,8 @@ Zielzustand: ein zentraler Observability-Stack fuer KalliLab CORE.
- `monitoring-grafana`: zentrale UI unter `https://monitoring.kaleschke.info`
- `monitoring-prometheus`: Metriken mit 30 Tagen Retention
- `monitoring-alertmanager`: Alert-Routing fuer Prometheus-Regeln
- `monitoring-alertmanager-ntfy-bridge`: uebersetzt Alertmanager-Webhooks zu ntfy-Pushes
- `monitoring-loki`: Container-Logs mit 30 Tagen Retention
- `monitoring-promtail`: Docker-Log-Discovery ueber read-only Docker-Socket
- `monitoring-node-exporter`: Host-Metriken
@@ -57,6 +59,7 @@ INFLUXDB_BIND_IP=192.168.178.58
- `https://monitoring.kaleschke.info` leitet zu Authelia.
- Grafana-Datasources `Prometheus`, `Loki` und `InfluxDB 3 Core` testen erfolgreich.
- Prometheus Targets: `prometheus`, `node-exporter`, `cadvisor`, `traefik`, `blackbox-http`.
- Alertmanager ist erreichbar und sendet ueber `monitoring-alertmanager-ntfy-bridge` nach `https://ntfy.kaleschke.info/homelab-alerts`.
- Loki zeigt Container-Logs mit Labels `container`, `compose_project`, `compose_service`.
- InfluxDB 3 Core enthaelt die Datenbank `homelab`.
@@ -66,3 +69,15 @@ INFLUXDB_BIND_IP=192.168.178.58
- Glances erst stoppen, wenn `Homelab / Host Overview` und `Homelab / Containers + Logs` fuer CPU, RAM, Disk, Network, Container-CPU und Container-RAM passen.
- Uptime Kuma erst stoppen, wenn `Homelab / Availability` und Grafana-Alerting mindestens sieben Tage parallel sauber laufen.
- Dashboard-Zielbestand: `Homelab / Availability`, `Homelab / Containers + Logs`, `Homelab / Host Overview`, `Traefik Official Standalone Dashboard`.
## Alerting
Prometheus wertet `monitoring/prometheus/alerts.yml` aus und sendet an `monitoring-alertmanager`.
Alertmanager routet alle Alerts an den ntfy-Bridge-Container.
Der Bridge-Container postet nach `https://ntfy.kaleschke.info/homelab-alerts`.
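Test (innerhalb von `monitoring_net` ausfuehren, z. B. aus einem anderen Container des Stacks; Port 8080 ist nur per `expose` im Docker-Netz erreichbar und nicht auf dem Host veroeffentlicht):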
```bash
curl -fsS http://alertmanager-ntfy-bridge:8080/healthz
```
@@ -0,0 +1,112 @@
import json
import os
import sys
import urllib.error
import urllib.request
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
# Full ntfy topic URL that notifications are POSTed to. Can be overridden with
# the NTFY_URL environment variable; the default is the homelab-alerts topic.
NTFY_URL = os.environ.get("NTFY_URL", "https://ntfy.kaleschke.info/homelab-alerts")
def priority_for(status, severity):
    """Return the ntfy priority (as a string) for an alert.

    A resolved alert is always "2". Otherwise the severity decides:
    "critical" -> "5", "warning" -> "4", anything else -> "3".
    """
    if status == "resolved":
        return "2"
    # Ordered (severity, priority) pairs; first match wins.
    for level, priority in (("critical", "5"), ("warning", "4")):
        if severity == level:
            return priority
    return "3"
def tags_for(status, severity):
    """Return the ntfy tag name for an alert.

    A resolved alert is always tagged "white_check_mark". Otherwise the
    severity decides: "critical" -> "rotating_light", "warning" -> "warning",
    anything else -> "information_source".
    """
    if status == "resolved":
        return "white_check_mark"
    return (
        "rotating_light" if severity == "critical"
        else "warning" if severity == "warning"
        else "information_source"
    )
def alert_message(alert):
    """Build the ntfy notification fields for a single alert.

    Args:
        alert: One entry of the ``alerts`` list of an Alertmanager webhook
            payload, as a dict. Fields that are missing, ``null`` or empty
            fall back to defaults instead of raising.

    Returns:
        A ``(title, message, priority, tags)`` tuple of strings. ``title`` is
        ``"<STATUS> <severity>: <alertname>"``; ``message`` holds the summary,
        a ``Target:`` line and, when it differs from the summary, the
        description, joined by newlines.
    """
    # Use "or" rather than a .get() default: a default only covers a missing
    # key, while an explicit JSON null would come back as None and break the
    # attribute access below.
    labels = alert.get("labels") or {}
    annotations = alert.get("annotations") or {}
    status = alert.get("status") or "firing"
    severity = labels.get("severity") or "info"
    alertname = labels.get("alertname") or "Alert"
    # First label that is present and non-empty identifies the affected target.
    target = labels.get("instance") or labels.get("service") or labels.get("mountpoint") or "homelab"
    summary = annotations.get("summary") or alertname
    description = annotations.get("description") or ""

    title = f"{status.upper()} {severity}: {alertname}"
    lines = [
        summary,
        f"Target: {target}",
    ]
    # Skip the description when it would merely repeat the summary line.
    if description and description != summary:
        lines.append(description)
    return title, "\n".join(lines), priority_for(status, severity), tags_for(status, severity)
def _ntfy_header(text):
    """Return ``text`` in a form that is safe to send as an HTTP header value."""
    import base64

    # Line breaks are rejected by http.client as header injection, so collapse
    # every whitespace run (including CR/LF) into a single space.
    flat = " ".join(str(text).split())
    try:
        flat.encode("ascii")
    except UnicodeEncodeError:
        # http.client encodes header values as latin-1 and fails on anything
        # else; ntfy accepts RFC 2047 encoded-words for non-ASCII headers.
        encoded = base64.b64encode(flat.encode("utf-8")).decode("ascii")
        return f"=?UTF-8?B?{encoded}?="
    return flat


def send_ntfy(title, message, priority, tags):
    """POST one notification to the ntfy topic at ``NTFY_URL``.

    Args:
        title: Notification title, sent as the ``Title`` header.
        message: Notification body, sent UTF-8 encoded as the request body.
        priority: ntfy priority, sent as the ``Priority`` header.
        tags: ntfy tags, sent as the ``Tags`` header.

    Raises:
        urllib.error.URLError: If the connection fails or ntfy answers with an
            error status (``HTTPError`` is a subclass).
        OSError: If reading the response fails, e.g. on a socket timeout.
    """
    req = urllib.request.Request(
        NTFY_URL,
        data=message.encode("utf-8"),
        headers={
            "Title": _ntfy_header(title),
            "Priority": _ntfy_header(priority),
            "Tags": _ntfy_header(tags),
        },
        method="POST",
    )
    # Drain the response so the connection is closed cleanly by the context manager.
    with urllib.request.urlopen(req, timeout=15) as response:
        response.read()
class Handler(BaseHTTPRequestHandler):
    """HTTP endpoints of the Alertmanager -> ntfy bridge.

    ``GET /healthz`` answers ``200`` with ``ok`` as a liveness probe.
    ``POST /alertmanager`` accepts an Alertmanager webhook payload and sends
    one ntfy notification per entry of its ``alerts`` list. Any other path
    is answered with ``404``.
    """

    def _respond(self, code, body=b""):
        """Send the status line, finish the headers and write an optional body."""
        self.send_response(code)
        self.end_headers()
        if body:
            self.wfile.write(body)

    def do_GET(self):
        """Serve the health check; every other path is a 404."""
        if self.path == "/healthz":
            self._respond(200, b"ok\n")
            return
        self._respond(404)

    def do_POST(self):
        """Forward the alerts of a webhook payload to ntfy.

        Responds ``400`` for a malformed request, ``502`` as soon as one
        notification cannot be delivered (remaining alerts are not attempted),
        and ``200`` with ``sent <n>`` otherwise.
        """
        if self.path != "/alertmanager":
            self._respond(404)
            return

        try:
            length = int(self.headers.get("Content-Length", "0"))
            if length < 0:
                # rfile.read(-1) would block until the peer closes the socket.
                raise ValueError("negative Content-Length")
            # An empty body is treated as an empty JSON object.
            payload = json.loads(self.rfile.read(length) or b"{}")
            if not isinstance(payload, dict):
                raise ValueError("payload is not a JSON object")
            alerts = payload.get("alerts") or []
            if not isinstance(alerts, list) or not all(isinstance(a, dict) for a in alerts):
                raise ValueError("'alerts' is not a list of objects")
        except ValueError as exc:
            # Also covers json.JSONDecodeError and UnicodeDecodeError, which
            # are ValueError subclasses.
            print(f"bad alertmanager request: {exc}", file=sys.stderr, flush=True)
            self._respond(400, b"invalid request body\n")
            return

        sent = 0
        for alert in alerts:
            title, message, priority, tags = alert_message(alert)
            try:
                send_ntfy(title, message, priority, tags)
            except (OSError, ValueError) as exc:
                # URLError/HTTPError are OSError subclasses; OSError also
                # covers socket timeouts raised while reading the response.
                # ValueError covers header values http.client refuses to send.
                # A 5xx answer lets the caller see the delivery failure.
                print(f"ntfy send failed: {exc}", file=sys.stderr, flush=True)
                self._respond(502, b"ntfy send failed\n")
                return
            sent += 1

        print(f"sent {sent} ntfy notifications", flush=True)
        self._respond(200, f"sent {sent}\n".encode("utf-8"))

    def log_message(self, fmt, *args):
        """Write access-log lines to stdout instead of the base class default."""
        print(fmt % args, flush=True)
if __name__ == "__main__":
    # Bind to all interfaces on port 8080 and handle each request in its own thread.
    bind_address = ("0.0.0.0", 8080)
    httpd = ThreadingHTTPServer(bind_address, Handler)
    print(f"alertmanager ntfy bridge listening on :8080 -> {NTFY_URL}", flush=True)
    # Blocks until the process is terminated.
    httpd.serve_forever()
+25
View File
@@ -0,0 +1,25 @@
# Alertmanager configuration: every alert is delivered to the ntfy bridge webhook.
global:
  resolve_timeout: 5m

route:
  # Default route for all alerts.
  receiver: ntfy-homelab
  group_by: [alertname, severity]
  group_wait: 20s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    # Critical alerts go to the same receiver but with shorter timers.
    - matchers:
        - 'severity="critical"'
      receiver: ntfy-homelab
      group_wait: 10s
      group_interval: 2m
      repeat_interval: 30m

receivers:
  - name: ntfy-homelab
    webhook_configs:
      # The bridge turns each alert of the payload into one ntfy push.
      - url: http://alertmanager-ntfy-bridge:8080/alertmanager
        send_resolved: true
        max_alerts: 10
+40
View File
@@ -19,10 +19,49 @@ services:
security_opt:
- no-new-privileges:true
depends_on:
- alertmanager
- blackbox-exporter
- node-exporter
- cadvisor
# Alertmanager: port 9093 is exposed to monitoring_net only, not published on the host.
alertmanager:
  container_name: monitoring-alertmanager
  image: prom/alertmanager:v0.28.1
  restart: unless-stopped
  security_opt:
    - 'no-new-privileges:true'
  networks:
    - monitoring_net
  expose:
    - "9093"
  volumes:
    # Configuration is mounted read-only; state lives in the named volume.
    - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    - alertmanager_data:/alertmanager
  command:
    - --config.file=/etc/alertmanager/alertmanager.yml
    - --storage.path=/alertmanager
# Webhook bridge: receives Alertmanager webhooks on :8080 and posts them to ntfy.
alertmanager-ntfy-bridge:
  image: python:3.13-alpine
  container_name: monitoring-alertmanager-ntfy-bridge
  restart: unless-stopped
  # NOTE(review): explicit public resolvers; presumably needed so the ntfy
  # hostname resolves from inside the container. Confirm this is still required.
  dns:
    - 1.1.1.1
    - 8.8.8.8
  environment:
    NTFY_URL: https://ntfy.kaleschke.info/homelab-alerts
  command:
    - python
    - /app/bridge.py
  volumes:
    # The script is mounted read-only into the stock Python image.
    - ./alertmanager-ntfy-bridge/bridge.py:/app/bridge.py:ro
  networks:
    - monitoring_net
  expose:
    - "8080"
  security_opt:
    - no-new-privileges:true
  # Probe the bridge's own /healthz endpoint so a hung or crashed server shows
  # up as "unhealthy". Uses the Python interpreter already in the image.
  healthcheck:
    test:
      - CMD
      - python
      - '-c'
      - "import urllib.request; urllib.request.urlopen('http://127.0.0.1:8080/healthz', timeout=3).read()"
    interval: 30s
    timeout: 5s
    retries: 3
    start_period: 10s
blackbox-exporter:
image: prom/blackbox-exporter:v0.27.0
container_name: monitoring-blackbox-exporter
@@ -311,6 +350,7 @@ networks:
volumes:
prometheus_data:
alertmanager_data:
loki_data:
promtail_positions:
grafana_data:
+6
View File
@@ -7,6 +7,12 @@ global:
# Alert rules evaluated by Prometheus.
rule_files:
  - /etc/prometheus/alerts.yml

# Alerts are sent to the Alertmanager container on its service port.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'
scrape_configs:
- job_name: prometheus
static_configs: