From db7dc3f2af142ef789ba9d72f0f8e584a05c5cb9 Mon Sep 17 00:00:00 2001
From: Micha
Date: Sun, 17 May 2026 11:34:19 +0200
Subject: [PATCH] Add ntfy alert delivery for monitoring

---
Reviewer amendments (the bridge.py blob id on its index line is stale):
- bridge.py: answer malformed webhook requests with 400; also catch OSError
  (read timeouts, connection resets) when posting to ntfy; add docstrings.
- README.md: health check command now runs from inside monitoring_net.

 docs/REPO_MAP.md                              |   2 +-
 docs/SERVICE_CATALOG.md                       |   4 +-
 monitoring/README.md                          |  15 +++
 monitoring/alertmanager-ntfy-bridge/bridge.py | 131 ++++++++++++++++++
 monitoring/alertmanager/alertmanager.yml      |  25 ++++
 monitoring/docker-compose.yml                 |  40 +++++
 monitoring/prometheus/prometheus.yml          |   6 +
 7 files changed, 221 insertions(+), 2 deletions(-)
 create mode 100644 monitoring/alertmanager-ntfy-bridge/bridge.py
 create mode 100644 monitoring/alertmanager/alertmanager.yml

diff --git a/docs/REPO_MAP.md b/docs/REPO_MAP.md
index 155e63d..e810afe 100644
--- a/docs/REPO_MAP.md
+++ b/docs/REPO_MAP.md
@@ -109,7 +109,7 @@ Secret-Werte werden hier nicht dokumentiert. Aufgefuehrt werden nur Variablennam
 | Glances | `ops/glances/docker-compose.yml` | `glances` -> `nicolargo/glances:latest-full@sha256:...` | `glances.kaleschke.info` | `frontend_net` | keine | Rootfs/Docker-Socket fuer Monitoring |
 | Grafana/InfluxDB | `ops/grafana-influxdb/docker-compose.yml` | `grafana`, `influxdb3-core` | `grafana.kaleschke.info` | `frontend_net`, `grafana_influx_internal`, `grafana_influx_lan` | `influxdb3-core`: `${INFLUXDB_BIND_IP:-127.0.0.1}:8181:8181` | abgeloester Altstand; nach erfolgreicher Migration durch `monitoring/` ersetzen |
 | Loki/Alloy | `ops/loki/docker-compose.yml` | `loki`, `alloy` | keine | `backend_net` | keine | abgeloester Altstand; nach erfolgreicher Migration durch `monitoring-loki`/`monitoring-promtail` ersetzen |
-| Monitoring | `monitoring/docker-compose.yml` | `monitoring-prometheus`, `monitoring-blackbox-exporter`, `monitoring-loki`, `monitoring-promtail`, `monitoring-grafana`, `monitoring-node-exporter`, `monitoring-cadvisor`, `monitoring-influxdb3-core`, optional `monitoring-grafana-dashboard-importer` | `monitoring.kaleschke.info` | `frontend_net`, `monitoring_net`, `monitoring_influx_lan` | `monitoring-influxdb3-core`: `${INFLUXDB_BIND_IP:-127.0.0.1}:8181:8181` | zentraler Zielstack fuer Prometheus/Loki/Grafana/InfluxDB; Blackbox ersetzt Uptime-Kuma-Checks nach Parallelphase; Promtail nutzt Docker socket read-only; Dashboard-Importer nur via `bootstrap`-Profil |
+| Monitoring | `monitoring/docker-compose.yml` | `monitoring-prometheus`, `monitoring-alertmanager`, `monitoring-alertmanager-ntfy-bridge`, `monitoring-blackbox-exporter`, `monitoring-loki`, `monitoring-promtail`, `monitoring-grafana`, `monitoring-node-exporter`, `monitoring-cadvisor`, `monitoring-influxdb3-core`, optional `monitoring-grafana-dashboard-importer` | `monitoring.kaleschke.info` | `frontend_net`, `monitoring_net`, `monitoring_influx_lan` | `monitoring-influxdb3-core`: `${INFLUXDB_BIND_IP:-127.0.0.1}:8181:8181` | zentraler Zielstack fuer Prometheus/Loki/Grafana/InfluxDB; Alertmanager sendet via ntfy-Bridge nach `homelab-alerts`; Blackbox ersetzt Uptime-Kuma-Checks nach Parallelphase; Promtail nutzt Docker socket read-only; Dashboard-Importer nur via `bootstrap`-Profil |
 | Hermes Agent | `ops/hermes-agent/docker-compose.yml` | `hermes-gateway`, `hermes-dashboard` -> local build from Dockerfile | `hermes.kaleschke.info` via `${HERMES_DASHBOARD_HOST}` | `hermes_net`, dashboard zusaetzlich `frontend_net` | `8642` nur expose intern | SSH runner, Home Assistant optional, LLM provider env; Dashboard hinter Authelia |
 | Komodo | `ops/komodo/docker-compose.yml` | `komodo-core`, `komodo-mongo`, `komodo-periphery` | `komodo.kaleschke.info` | `frontend_net`, `komodo_net` | keine | Mongo, Docker socket, `/mnt/user/services` workspace mount, Gitea DNS override |
 | Scrutiny | `ops/scrutiny/docker-compose.yml` | `scrutiny` -> `ghcr.io/starosdev/scrutiny:latest-omnibus@sha256:...` | `scrutiny.kaleschke.info` | `frontend_net` | keine | `privileged: true`, device mounts fuer SMART |
diff --git a/docs/SERVICE_CATALOG.md b/docs/SERVICE_CATALOG.md
index 4c61506..aa19d56 100644
--- a/docs/SERVICE_CATALOG.md
+++ b/docs/SERVICE_CATALOG.md
@@ -68,7 +68,9 @@ Secret-Werte sind nicht enthalten. Es werden nur Secret-Namen, Env-Key-Namen und
 | `grafana` | abgeloester Altstand fuer Grafana/InfluxDB | `ops/grafana-influxdb/docker-compose.yml` | `https://grafana.kaleschke.info` | Traefik + Authelia, InfluxDB 3 Core | `/mnt/user/appdata/grafana`, Grafana provisioning | Tier 3, `grafana.sqlite` | ja + Authelia | Nicht parallel zum neuen `monitoring/`-Zielstack betreiben; bleibt vorerst als Rollback-/Migrationsreferenz |
 | `influxdb3-core` | abgeloester Altstand fuer Home-Assistant-Langzeitdaten | `ops/grafana-influxdb/docker-compose.yml` | LAN `8181` je `INFLUXDB_BIND_IP`, keine Public URL | Grafana, Home Assistant Writer | `/mnt/user/appdata/influxdb3/data`, `/mnt/user/appdata/influxdb3/plugins` | Tier 3 | nein | Nach erfolgreicher Migration durch `monitoring-influxdb3-core` ersetzen; alten Datenpfad nicht blind loeschen |
 | `monitoring-grafana` | zentrale Observability-UI fuer Metriken, Logs und InfluxDB | `monitoring/docker-compose.yml` | `https://monitoring.kaleschke.info` | Traefik + Authelia, Prometheus, Loki, InfluxDB 3 Core | named volume `grafana_data`, Provisioning unter `monitoring/grafana/provisioning`, Dashboards unter `monitoring/grafana/dashboards` | Tier 3, named volume | ja + Authelia | Admin-Passwort ueber `monitoring_grafana_admin_password.txt`; Zielbestand: `Homelab / Availability`, `Homelab / Host Overview`, `Homelab / Containers + Logs`, `Traefik Official Standalone Dashboard`; Dashboard-Importer ist optionales `bootstrap`-Profil fuer Traefik |
-| `monitoring-prometheus` | Metrik-Speicher fuer Homelab-Monitoring | `monitoring/docker-compose.yml`, `monitoring/prometheus/prometheus.yml`, `monitoring/prometheus/alerts.yml` | intern `http://prometheus:9090` | `monitoring_net`, node-exporter, cAdvisor, Traefik-Metrics, Blackbox Exporter | named volume `prometheus_data` | Tier 3, transiente Metriken mit 30 Tagen Retention | nein | Scrapes: Prometheus, node-exporter, cAdvisor, Traefik `:8082`, `blackbox-http`; Alerts werden in Prometheus ausgewertet, Grafana-Contact-Point folgt separat |
+| `monitoring-prometheus` | Metrik-Speicher fuer Homelab-Monitoring | `monitoring/docker-compose.yml`, `monitoring/prometheus/prometheus.yml`, `monitoring/prometheus/alerts.yml` | intern `http://prometheus:9090` | `monitoring_net`, node-exporter, cAdvisor, Traefik-Metrics, Blackbox Exporter, Alertmanager | named volume `prometheus_data` | Tier 3, transiente Metriken mit 30 Tagen Retention | nein | Scrapes: Prometheus, node-exporter, cAdvisor, Traefik `:8082`, `blackbox-http`; Prometheus-Regeln senden an Alertmanager und von dort nach ntfy |
+| `monitoring-alertmanager` | Alert-Routing fuer Prometheus-Regeln | `monitoring/docker-compose.yml`, `monitoring/alertmanager/alertmanager.yml` | intern `:9093` | Prometheus, ntfy Bridge | named volume `alertmanager_data` | Tier 3 | nein | sendet firing und resolved Alerts an `monitoring-alertmanager-ntfy-bridge` |
+| `monitoring-alertmanager-ntfy-bridge` | Alertmanager-Webhook nach ntfy Push | `monitoring/docker-compose.yml`, `monitoring/alertmanager-ntfy-bridge/bridge.py` | intern `:8080` | Alertmanager, `https://ntfy.kaleschke.info/homelab-alerts` | kein kritischer Zustand | rebuildbar | nein | formatiert Alertmanager JSON als ntfy Titel, Nachricht, Priority und Tags; keine Secrets |
 | `monitoring-blackbox-exporter` | HTTP-Erreichbarkeitspruefungen fuer Uptime-Kuma-Abloesung | `monitoring/docker-compose.yml`, `monitoring/blackbox/blackbox.yml` | intern `:9115` | Prometheus, externe HTTPS-Ziele | kein kritischer Zustand | rebuildbar | nein | Uptime Kuma erst nach sieben Tagen Parallelbetrieb und Grafana-Alerting-Paritaet stoppen |
 | `monitoring-loki` | Logspeicher fuer Monitoring-Stack | `monitoring/docker-compose.yml`, `monitoring/loki/loki-config.yml` | intern `http://loki:3100` | `monitoring_net`, Promtail, Grafana | named volume `loki_data` | Tier 3, transiente Logs mit 30 Tagen Retention | nein | Von bestehendem `ops/loki` getrennt; Doppelbetrieb bewusst pruefen |
 | `monitoring-promtail` | Docker-Log-Collector fuer Monitoring-Loki | `monitoring/docker-compose.yml`, `monitoring/promtail/promtail-config.yml` | intern | Docker socket read-only, Docker json-file Logs, Loki | named volume `promtail_positions` | rebuildbar | nein | Dokumentierte Host-Observability-Ausnahme: `/var/run/docker.sock:/var/run/docker.sock:ro` und `/var/lib/docker/containers:ro`; keine Appdaten, nur Log-Discovery |
diff --git a/monitoring/README.md b/monitoring/README.md
index 3bd753f..7b71a92 100644
--- a/monitoring/README.md
+++ b/monitoring/README.md
@@ -6,6 +6,8 @@ Zielzustand: ein zentraler Observability-Stack fuer KalliLab CORE.
 
 - `monitoring-grafana`: zentrale UI unter `https://monitoring.kaleschke.info`
 - `monitoring-prometheus`: Metriken mit 30 Tagen Retention
+- `monitoring-alertmanager`: Alert-Routing fuer Prometheus-Regeln
+- `monitoring-alertmanager-ntfy-bridge`: uebersetzt Alertmanager-Webhooks zu ntfy-Pushes
 - `monitoring-loki`: Container-Logs mit 30 Tagen Retention
 - `monitoring-promtail`: Docker-Log-Discovery ueber read-only Docker-Socket
 - `monitoring-node-exporter`: Host-Metriken
@@ -57,6 +59,7 @@ INFLUXDB_BIND_IP=192.168.178.58
 - `https://monitoring.kaleschke.info` leitet zu Authelia.
 - Grafana-Datasources `Prometheus`, `Loki` und `InfluxDB 3 Core` testen erfolgreich.
 - Prometheus Targets: `prometheus`, `node-exporter`, `cadvisor`, `traefik`, `blackbox-http`.
+- Alertmanager ist erreichbar und sendet ueber `monitoring-alertmanager-ntfy-bridge` nach `https://ntfy.kaleschke.info/homelab-alerts`.
 - Loki zeigt Container-Logs mit Labels `container`, `compose_project`, `compose_service`.
 - InfluxDB 3 Core enthaelt die Datenbank `homelab`.
 
@@ -66,3 +69,15 @@ INFLUXDB_BIND_IP=192.168.178.58
 - Glances erst stoppen, wenn `Homelab / Host Overview` und `Homelab / Containers + Logs` fuer CPU, RAM, Disk, Network, Container-CPU und Container-RAM passen.
 - Uptime Kuma erst stoppen, wenn `Homelab / Availability` und Grafana-Alerting mindestens sieben Tage parallel sauber laufen.
 - Dashboard-Zielbestand: `Homelab / Availability`, `Homelab / Containers + Logs`, `Homelab / Host Overview`, `Traefik Official Standalone Dashboard`.
+
+## Alerting
+
+Prometheus wertet `monitoring/prometheus/alerts.yml` aus und sendet an `monitoring-alertmanager`.
+Alertmanager routet alle Alerts an den ntfy-Bridge-Container.
+Der Bridge-Container postet nach `https://ntfy.kaleschke.info/homelab-alerts`.
+
+Test (der Bridge-Name ist nur innerhalb von `monitoring_net` aufloesbar, daher aus dem Alertmanager-Container):
+
+```bash
+docker exec monitoring-alertmanager wget -qO- http://alertmanager-ntfy-bridge:8080/healthz
+```
diff --git a/monitoring/alertmanager-ntfy-bridge/bridge.py b/monitoring/alertmanager-ntfy-bridge/bridge.py
new file mode 100644
index 0000000..1a1820c
--- /dev/null
+++ b/monitoring/alertmanager-ntfy-bridge/bridge.py
@@ -0,0 +1,131 @@
+"""Translate Alertmanager webhook payloads into ntfy push notifications."""
+
+import json
+import os
+import sys
+import urllib.error
+import urllib.request
+from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
+
+
+NTFY_URL = os.environ.get("NTFY_URL", "https://ntfy.kaleschke.info/homelab-alerts")
+
+
+def priority_for(status, severity):
+    """Return the ntfy priority ("2" low .. "5" max) for an alert."""
+    if status == "resolved":
+        return "2"
+    if severity == "critical":
+        return "5"
+    if severity == "warning":
+        return "4"
+    return "3"
+
+
+def tags_for(status, severity):
+    """Return the ntfy tag (emoji shortcode) for an alert."""
+    if status == "resolved":
+        return "white_check_mark"
+    if severity == "critical":
+        return "rotating_light"
+    if severity == "warning":
+        return "warning"
+    return "information_source"
+
+
+def alert_message(alert):
+    """Build (title, body, priority, tags) from one Alertmanager alert dict."""
+    labels = alert.get("labels", {})
+    annotations = alert.get("annotations", {})
+    status = alert.get("status", "firing")
+    severity = labels.get("severity", "info")
+    alertname = labels.get("alertname", "Alert")
+    target = labels.get("instance") or labels.get("service") or labels.get("mountpoint") or "homelab"
+    summary = annotations.get("summary") or alertname
+    description = annotations.get("description") or ""
+
+    title = f"{status.upper()} {severity}: {alertname}"
+    lines = [
+        summary,
+        f"Target: {target}",
+    ]
+    if description and description != summary:
+        lines.append(description)
+    return title, "\n".join(lines), priority_for(status, severity), tags_for(status, severity)
+
+
+def send_ntfy(title, message, priority, tags):
+    """POST one notification to NTFY_URL; network errors raise OSError subclasses."""
+    req = urllib.request.Request(
+        NTFY_URL,
+        data=message.encode("utf-8"),
+        headers={
+            "Title": title,
+            "Priority": priority,
+            "Tags": tags,
+        },
+        method="POST",
+    )
+    with urllib.request.urlopen(req, timeout=15) as response:
+        response.read()
+
+
+class Handler(BaseHTTPRequestHandler):
+    """Serve GET /healthz and the Alertmanager webhook at POST /alertmanager."""
+
+    def do_GET(self):
+        if self.path == "/healthz":
+            self.send_response(200)
+            self.end_headers()
+            self.wfile.write(b"ok\n")
+            return
+        self.send_response(404)
+        self.end_headers()
+
+    def do_POST(self):
+        if self.path != "/alertmanager":
+            self.send_response(404)
+            self.end_headers()
+            return
+
+        # Answer malformed requests with 400 instead of letting the exception
+        # escape the handler, which would close the socket without a response.
+        try:
+            length = int(self.headers.get("Content-Length", "0"))
+            payload = json.loads(self.rfile.read(length) or b"{}")
+            alerts = payload.get("alerts") or []
+        except (ValueError, AttributeError) as exc:
+            print(f"invalid webhook payload: {exc}", file=sys.stderr, flush=True)
+            self.send_response(400)
+            self.end_headers()
+            self.wfile.write(b"invalid payload\n")
+            return
+
+        sent = 0
+        for alert in alerts:
+            title, message, priority, tags = alert_message(alert)
+            # Read timeouts and connection resets surface as plain OSError
+            # rather than URLError, so both are treated as a failed delivery.
+            try:
+                send_ntfy(title, message, priority, tags)
+                sent += 1
+            except (urllib.error.URLError, OSError) as exc:
+                print(f"ntfy send failed: {exc}", file=sys.stderr, flush=True)
+                self.send_response(502)
+                self.end_headers()
+                self.wfile.write(b"ntfy send failed\n")
+                return
+
+        print(f"sent {sent} ntfy notifications", flush=True)
+        self.send_response(200)
+        self.end_headers()
+        self.wfile.write(f"sent {sent}\n".encode("utf-8"))
+
+    def log_message(self, fmt, *args):
+        print(fmt % args, flush=True)
+
+
+if __name__ == "__main__":
+    server = ThreadingHTTPServer(("0.0.0.0", 8080), Handler)
+    print(f"alertmanager ntfy bridge listening on :8080 -> {NTFY_URL}", flush=True)
+    server.serve_forever()
diff --git a/monitoring/alertmanager/alertmanager.yml b/monitoring/alertmanager/alertmanager.yml
new file mode 100644
index 0000000..3d5206b
--- /dev/null
+++ b/monitoring/alertmanager/alertmanager.yml
@@ -0,0 +1,25 @@
+global:
+  resolve_timeout: 5m
+
+route:
+  receiver: ntfy-homelab
+  group_by:
+    - alertname
+    - severity
+  group_wait: 20s
+  group_interval: 5m
+  repeat_interval: 4h
+  routes:
+    - receiver: ntfy-homelab
+      matchers:
+        - severity="critical"
+      group_wait: 10s
+      group_interval: 2m
+      repeat_interval: 30m
+
+receivers:
+  - name: ntfy-homelab
+    webhook_configs:
+      - url: http://alertmanager-ntfy-bridge:8080/alertmanager
+        send_resolved: true
+        max_alerts: 10
diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml
index f75073d..e3a39e5 100644
--- a/monitoring/docker-compose.yml
+++ b/monitoring/docker-compose.yml
@@ -19,10 +19,49 @@ services:
     security_opt:
       - no-new-privileges:true
     depends_on:
+      - alertmanager
       - blackbox-exporter
       - node-exporter
       - cadvisor
 
+  alertmanager:
+    image: prom/alertmanager:v0.28.1
+    container_name: monitoring-alertmanager
+    restart: unless-stopped
+    command:
+      - --config.file=/etc/alertmanager/alertmanager.yml
+      - --storage.path=/alertmanager
+    volumes:
+      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
+      - alertmanager_data:/alertmanager
+    networks:
+      - monitoring_net
+    expose:
+      - "9093"
+    security_opt:
+      - no-new-privileges:true
+
+  alertmanager-ntfy-bridge:
+    image: python:3.13-alpine
+    container_name: monitoring-alertmanager-ntfy-bridge
+    restart: unless-stopped
+    dns:
+      - 1.1.1.1
+      - 8.8.8.8
+    environment:
+      NTFY_URL: https://ntfy.kaleschke.info/homelab-alerts
+    command:
+      - python
+      - /app/bridge.py
+    volumes:
+      - ./alertmanager-ntfy-bridge/bridge.py:/app/bridge.py:ro
+    networks:
+      - monitoring_net
+    expose:
+      - "8080"
+    security_opt:
+      - no-new-privileges:true
+
   blackbox-exporter:
     image: prom/blackbox-exporter:v0.27.0
     container_name: monitoring-blackbox-exporter
@@ -311,6 +350,7 @@ networks:
 
 volumes:
   prometheus_data:
+  alertmanager_data:
   loki_data:
   promtail_positions:
   grafana_data:
diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml
index 29151c5..44070f3 100644
--- a/monitoring/prometheus/prometheus.yml
+++ b/monitoring/prometheus/prometheus.yml
@@ -7,6 +7,12 @@ global:
 rule_files:
   - /etc/prometheus/alerts.yml
 
+alerting:
+  alertmanagers:
+    - static_configs:
+        - targets:
+            - alertmanager:9093
+
 scrape_configs:
   - job_name: prometheus
     static_configs: