Add ntfy alert delivery for monitoring

This commit is contained in:
2026-05-17 11:34:19 +02:00
parent c748236886
commit db7dc3f2af
7 changed files with 202 additions and 2 deletions
+1 -1
View File
@@ -109,7 +109,7 @@ Secret-Werte werden hier nicht dokumentiert. Aufgefuehrt werden nur Variablennam
| Glances | `ops/glances/docker-compose.yml` | `glances` -> `nicolargo/glances:latest-full@sha256:...` | `glances.kaleschke.info` | `frontend_net` | keine | Rootfs/Docker-Socket fuer Monitoring |
| Grafana/InfluxDB | `ops/grafana-influxdb/docker-compose.yml` | `grafana`, `influxdb3-core` | `grafana.kaleschke.info` | `frontend_net`, `grafana_influx_internal`, `grafana_influx_lan` | `influxdb3-core`: `${INFLUXDB_BIND_IP:-127.0.0.1}:8181:8181` | abgeloester Altstand; nach erfolgreicher Migration durch `monitoring/` ersetzen |
| Loki/Alloy | `ops/loki/docker-compose.yml` | `loki`, `alloy` | keine | `backend_net` | keine | abgeloester Altstand; nach erfolgreicher Migration durch `monitoring-loki`/`monitoring-promtail` ersetzen |
| Monitoring | `monitoring/docker-compose.yml` | `monitoring-prometheus`, `monitoring-blackbox-exporter`, `monitoring-loki`, `monitoring-promtail`, `monitoring-grafana`, `monitoring-node-exporter`, `monitoring-cadvisor`, `monitoring-influxdb3-core`, optional `monitoring-grafana-dashboard-importer` | `monitoring.kaleschke.info` | `frontend_net`, `monitoring_net`, `monitoring_influx_lan` | `monitoring-influxdb3-core`: `${INFLUXDB_BIND_IP:-127.0.0.1}:8181:8181` | zentraler Zielstack fuer Prometheus/Loki/Grafana/InfluxDB; Blackbox ersetzt Uptime-Kuma-Checks nach Parallelphase; Promtail nutzt Docker socket read-only; Dashboard-Importer nur via `bootstrap`-Profil |
| Monitoring | `monitoring/docker-compose.yml` | `monitoring-prometheus`, `monitoring-alertmanager`, `monitoring-alertmanager-ntfy-bridge`, `monitoring-blackbox-exporter`, `monitoring-loki`, `monitoring-promtail`, `monitoring-grafana`, `monitoring-node-exporter`, `monitoring-cadvisor`, `monitoring-influxdb3-core`, optional `monitoring-grafana-dashboard-importer` | `monitoring.kaleschke.info` | `frontend_net`, `monitoring_net`, `monitoring_influx_lan` | `monitoring-influxdb3-core`: `${INFLUXDB_BIND_IP:-127.0.0.1}:8181:8181` | zentraler Zielstack fuer Prometheus/Loki/Grafana/InfluxDB; Alertmanager sendet via ntfy-Bridge nach `homelab-alerts`; Blackbox ersetzt Uptime-Kuma-Checks nach Parallelphase; Promtail nutzt Docker socket read-only; Dashboard-Importer nur via `bootstrap`-Profil |
| Hermes Agent | `ops/hermes-agent/docker-compose.yml` | `hermes-gateway`, `hermes-dashboard` -> local build from Dockerfile | `hermes.kaleschke.info` via `${HERMES_DASHBOARD_HOST}` | `hermes_net`, dashboard zusaetzlich `frontend_net` | `8642` nur expose intern | SSH runner, Home Assistant optional, LLM provider env; Dashboard hinter Authelia |
| Komodo | `ops/komodo/docker-compose.yml` | `komodo-core`, `komodo-mongo`, `komodo-periphery` | `komodo.kaleschke.info` | `frontend_net`, `komodo_net` | keine | Mongo, Docker socket, `/mnt/user/services` workspace mount, Gitea DNS override |
| Scrutiny | `ops/scrutiny/docker-compose.yml` | `scrutiny` -> `ghcr.io/starosdev/scrutiny:latest-omnibus@sha256:...` | `scrutiny.kaleschke.info` | `frontend_net` | keine | `privileged: true`, device mounts fuer SMART |
+3 -1
View File
@@ -68,7 +68,9 @@ Secret-Werte sind nicht enthalten. Es werden nur Secret-Namen, Env-Key-Namen und
| `grafana` | abgeloester Altstand fuer Grafana/InfluxDB | `ops/grafana-influxdb/docker-compose.yml` | `https://grafana.kaleschke.info` | Traefik + Authelia, InfluxDB 3 Core | `/mnt/user/appdata/grafana`, Grafana provisioning | Tier 3, `grafana.sqlite` | ja + Authelia | Nicht parallel zum neuen `monitoring/`-Zielstack betreiben; bleibt vorerst als Rollback-/Migrationsreferenz |
| `influxdb3-core` | abgeloester Altstand fuer Home-Assistant-Langzeitdaten | `ops/grafana-influxdb/docker-compose.yml` | LAN `8181` je `INFLUXDB_BIND_IP`, keine Public URL | Grafana, Home Assistant Writer | `/mnt/user/appdata/influxdb3/data`, `/mnt/user/appdata/influxdb3/plugins` | Tier 3 | nein | Nach erfolgreicher Migration durch `monitoring-influxdb3-core` ersetzen; alten Datenpfad nicht blind loeschen |
| `monitoring-grafana` | zentrale Observability-UI fuer Metriken, Logs und InfluxDB | `monitoring/docker-compose.yml` | `https://monitoring.kaleschke.info` | Traefik + Authelia, Prometheus, Loki, InfluxDB 3 Core | named volume `grafana_data`, Provisioning unter `monitoring/grafana/provisioning`, Dashboards unter `monitoring/grafana/dashboards` | Tier 3, named volume | ja + Authelia | Admin-Passwort ueber `monitoring_grafana_admin_password.txt`; Zielbestand: `Homelab / Availability`, `Homelab / Host Overview`, `Homelab / Containers + Logs`, `Traefik Official Standalone Dashboard`; Dashboard-Importer ist optionales `bootstrap`-Profil fuer Traefik |
| `monitoring-prometheus` | Metrik-Speicher fuer Homelab-Monitoring | `monitoring/docker-compose.yml`, `monitoring/prometheus/prometheus.yml`, `monitoring/prometheus/alerts.yml` | intern `http://prometheus:9090` | `monitoring_net`, node-exporter, cAdvisor, Traefik-Metrics, Blackbox Exporter | named volume `prometheus_data` | Tier 3, transiente Metriken mit 30 Tagen Retention | nein | Scrapes: Prometheus, node-exporter, cAdvisor, Traefik `:8082`, `blackbox-http`; Alerts werden in Prometheus ausgewertet, Grafana-Contact-Point folgt separat |
| `monitoring-prometheus` | Metrik-Speicher fuer Homelab-Monitoring | `monitoring/docker-compose.yml`, `monitoring/prometheus/prometheus.yml`, `monitoring/prometheus/alerts.yml` | intern `http://prometheus:9090` | `monitoring_net`, node-exporter, cAdvisor, Traefik-Metrics, Blackbox Exporter, Alertmanager | named volume `prometheus_data` | Tier 3, transiente Metriken mit 30 Tagen Retention | nein | Scrapes: Prometheus, node-exporter, cAdvisor, Traefik `:8082`, `blackbox-http`; Prometheus-Regeln senden an Alertmanager und von dort nach ntfy |
| `monitoring-alertmanager` | Alert-Routing fuer Prometheus-Regeln | `monitoring/docker-compose.yml`, `monitoring/alertmanager/alertmanager.yml` | intern `:9093` | Prometheus, ntfy Bridge | named volume `alertmanager_data` | Tier 3 | nein | sendet firing und resolved Alerts an `monitoring-alertmanager-ntfy-bridge` |
| `monitoring-alertmanager-ntfy-bridge` | Alertmanager-Webhook nach ntfy Push | `monitoring/docker-compose.yml`, `monitoring/alertmanager-ntfy-bridge/bridge.py` | intern `:8080` | Alertmanager, `https://ntfy.kaleschke.info/homelab-alerts` | kein kritischer Zustand | rebuildbar | nein | formatiert Alertmanager JSON als ntfy Titel, Nachricht, Priority und Tags; keine Secrets |
| `monitoring-blackbox-exporter` | HTTP-Erreichbarkeitspruefungen fuer Uptime-Kuma-Abloesung | `monitoring/docker-compose.yml`, `monitoring/blackbox/blackbox.yml` | intern `:9115` | Prometheus, externe HTTPS-Ziele | kein kritischer Zustand | rebuildbar | nein | Uptime Kuma erst nach sieben Tagen Parallelbetrieb und Grafana-Alerting-Paritaet stoppen |
| `monitoring-loki` | Logspeicher fuer Monitoring-Stack | `monitoring/docker-compose.yml`, `monitoring/loki/loki-config.yml` | intern `http://loki:3100` | `monitoring_net`, Promtail, Grafana | named volume `loki_data` | Tier 3, transiente Logs mit 30 Tagen Retention | nein | Von bestehendem `ops/loki` getrennt; Doppelbetrieb bewusst pruefen |
| `monitoring-promtail` | Docker-Log-Collector fuer Monitoring-Loki | `monitoring/docker-compose.yml`, `monitoring/promtail/promtail-config.yml` | intern | Docker socket read-only, Docker json-file Logs, Loki | named volume `promtail_positions` | rebuildbar | nein | Dokumentierte Host-Observability-Ausnahme: `/var/run/docker.sock:/var/run/docker.sock:ro` und `/var/lib/docker/containers:ro`; keine Appdaten, nur Log-Discovery |
+15
View File
@@ -6,6 +6,8 @@ Zielzustand: ein zentraler Observability-Stack fuer KalliLab CORE.
- `monitoring-grafana`: zentrale UI unter `https://monitoring.kaleschke.info`
- `monitoring-prometheus`: Metriken mit 30 Tagen Retention
- `monitoring-alertmanager`: Alert-Routing fuer Prometheus-Regeln
- `monitoring-alertmanager-ntfy-bridge`: uebersetzt Alertmanager-Webhooks zu ntfy-Pushes
- `monitoring-loki`: Container-Logs mit 30 Tagen Retention
- `monitoring-promtail`: Docker-Log-Discovery ueber read-only Docker-Socket
- `monitoring-node-exporter`: Host-Metriken
@@ -57,6 +59,7 @@ INFLUXDB_BIND_IP=192.168.178.58
- `https://monitoring.kaleschke.info` leitet zu Authelia.
- Grafana-Datasources `Prometheus`, `Loki` und `InfluxDB 3 Core` testen erfolgreich.
- Prometheus Targets: `prometheus`, `node-exporter`, `cadvisor`, `traefik`, `blackbox-http`.
- Alertmanager ist erreichbar und sendet ueber `monitoring-alertmanager-ntfy-bridge` nach `https://ntfy.kaleschke.info/homelab-alerts`.
- Loki zeigt Container-Logs mit Labels `container`, `compose_project`, `compose_service`.
- InfluxDB 3 Core enthaelt die Datenbank `homelab`.
@@ -66,3 +69,15 @@ INFLUXDB_BIND_IP=192.168.178.58
- Glances erst stoppen, wenn `Homelab / Host Overview` und `Homelab / Containers + Logs` fuer CPU, RAM, Disk, Network, Container-CPU und Container-RAM passen.
- Uptime Kuma erst stoppen, wenn `Homelab / Availability` und das Alerting (Prometheus-Regeln ueber Alertmanager nach ntfy) mindestens sieben Tage parallel sauber laufen.
- Dashboard-Zielbestand: `Homelab / Availability`, `Homelab / Containers + Logs`, `Homelab / Host Overview`, `Traefik Official Standalone Dashboard`.
## Alerting
Prometheus wertet `monitoring/prometheus/alerts.yml` aus und sendet an `monitoring-alertmanager`.
Alertmanager routet alle Alerts an den ntfy-Bridge-Container.
Der Bridge-Container postet nach `https://ntfy.kaleschke.info/homelab-alerts`.
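Test (innerhalb von `monitoring_net` ausfuehren, z. B. aus einem Container des Stacks; der Dienstname ist vom Docker-Host aus nicht aufloesbar und Port 8080 ist nur per `expose` freigegeben, nicht veroeffentlicht):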
```bash
curl -fsS http://alertmanager-ntfy-bridge:8080/healthz
```
@@ -0,0 +1,112 @@
import json
import os
import sys
import urllib.error
import urllib.request
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
# ntfy topic URL that notifications are POSTed to. Taken from the NTFY_URL
# environment variable; falls back to the homelab-alerts topic when unset.
NTFY_URL = os.getenv("NTFY_URL", "https://ntfy.kaleschke.info/homelab-alerts")
def priority_for(status, severity):
    """Return the ntfy priority (as a string) for an alert.

    A resolved alert always yields "2". Any other status is ranked by
    severity: "critical" -> "5", "warning" -> "4", anything else -> "3".
    """
    if status == "resolved":
        return "2"
    severity_levels = (("critical", "5"), ("warning", "4"))
    return next(
        (level for name, level in severity_levels if severity == name),
        "3",
    )
def tags_for(status, severity):
    """Return the ntfy tag name for an alert.

    A resolved alert always yields "white_check_mark". Any other status is
    tagged by severity: "critical" -> "rotating_light", "warning" ->
    "warning", anything else -> "information_source".
    """
    return (
        "white_check_mark" if status == "resolved"
        else "rotating_light" if severity == "critical"
        else "warning" if severity == "warning"
        else "information_source"
    )
def alert_message(alert):
    """Turn one Alertmanager alert dict into ntfy notification fields.

    Reads ``status``, ``labels`` and ``annotations`` from ``alert`` and
    returns a 4-tuple ``(title, message, priority, tags)``:

    * title   - "<STATUS> <severity>: <alertname>"
    * message - the summary (or the alert name when there is none), a
                "Target: ..." line, and the description when it is
                non-empty and differs from the summary
    * priority, tags - from ``priority_for`` / ``tags_for``

    Missing fields fall back to "firing", "info", "Alert" and "homelab".
    """
    label_map = alert.get("labels", {})
    annotation_map = alert.get("annotations", {})
    state = alert.get("status", "firing")
    level = label_map.get("severity", "info")
    name = label_map.get("alertname", "Alert")

    # The first non-empty label among these identifies what the alert is about.
    subject = next(
        (
            label_map.get(key)
            for key in ("instance", "service", "mountpoint")
            if label_map.get(key)
        ),
        "homelab",
    )

    headline = annotation_map.get("summary") or name
    details = annotation_map.get("description") or ""
    heading = f"{state.upper()} {level}: {name}"

    body_lines = [headline, f"Target: {subject}"]
    if details and details != headline:
        body_lines.append(details)
    body = "\n".join(body_lines)

    return heading, body, priority_for(state, level), tags_for(state, level)
def send_ntfy(title, message, priority, tags):
    """POST one notification to the ntfy topic at ``NTFY_URL``.

    Args:
        title: Notification title, sent in the ``Title`` header.
        message: Notification text, sent UTF-8 encoded as the request body.
        priority: ntfy priority string, sent in the ``Priority`` header.
        tags: ntfy tag string, sent in the ``Tags`` header.

    Raises:
        urllib.error.URLError: The request could not be sent or ntfy
            answered with an error status (``HTTPError`` is a subclass).
        OSError: Lower-level socket failure, e.g. a timeout while reading
            the response.
    """
    # Function-scope import: base64 is only needed for header encoding here.
    import base64

    def header_value(text):
        # http.client rejects header values containing bare line breaks and
        # encodes them as Latin-1, so a title built from alert labels could
        # otherwise raise ValueError/UnicodeEncodeError before anything is
        # sent. Line breaks are flattened to spaces; non-ASCII text is sent
        # as an RFC 2047 encoded-word, which ntfy documents for headers.
        flat = " ".join(str(text).splitlines())
        if flat.isascii():
            return flat
        encoded = base64.b64encode(flat.encode("utf-8")).decode("ascii")
        return f"=?UTF-8?B?{encoded}?="

    request = urllib.request.Request(
        NTFY_URL,
        data=message.encode("utf-8"),
        headers={
            "Title": header_value(title),
            "Priority": priority,
            "Tags": header_value(tags),
        },
        method="POST",
    )
    with urllib.request.urlopen(request, timeout=15) as response:
        # Drain the body so the connection is finished cleanly.
        response.read()
class Handler(BaseHTTPRequestHandler):
    """HTTP handler that forwards Alertmanager webhook calls to ntfy.

    Routes:
        GET  /healthz       -> 200 with body "ok"
        POST /alertmanager  -> sends one ntfy notification per entry in the
                               payload's "alerts" list
        anything else       -> 404

    POST responses:
        200 "sent <n>"  every alert was delivered
        400             the request body is not a usable Alertmanager payload
        502             delivering an alert to ntfy failed; processing stops
                        at the first failure
    """

    def _reply(self, status, body=b""):
        """Send a complete response with the given status and optional body."""
        self.send_response(status)
        if body:
            self.send_header("Content-Type", "text/plain; charset=utf-8")
        self.send_header("Content-Length", str(len(body)))
        self.end_headers()
        if body:
            self.wfile.write(body)

    def do_GET(self):
        """Answer the health check; every other path is a 404."""
        if self.path == "/healthz":
            self._reply(200, b"ok\n")
            return
        self._reply(404)

    def do_POST(self):
        """Forward the alerts of an Alertmanager webhook payload to ntfy."""
        if self.path != "/alertmanager":
            self._reply(404)
            return

        try:
            length = int(self.headers.get("Content-Length", "0"))
            if length < 0:
                # read(-1) would wait for EOF instead of returning.
                raise ValueError("negative Content-Length")
            # JSONDecodeError and UnicodeDecodeError are ValueError subclasses.
            payload = json.loads(self.rfile.read(length) or b"{}")
        except ValueError as exc:
            print(f"bad request: {exc}", file=sys.stderr, flush=True)
            self._reply(400, b"invalid request body\n")
            return

        alerts = payload.get("alerts", []) if isinstance(payload, dict) else None
        if not isinstance(alerts, list) or not all(isinstance(a, dict) for a in alerts):
            print("bad request: unexpected payload shape", file=sys.stderr, flush=True)
            self._reply(400, b"invalid alertmanager payload\n")
            return

        sent = 0
        for alert in alerts:
            title, message, priority, tags = alert_message(alert)
            try:
                send_ntfy(title, message, priority, tags)
                sent += 1
            except Exception as exc:
                # Deliberately broad: URLError, socket timeouts while reading
                # the response and header-encoding errors must all produce an
                # error status rather than a dropped connection.
                print(f"ntfy send failed: {exc}", file=sys.stderr, flush=True)
                self._reply(502, b"ntfy send failed\n")
                return

        print(f"sent {sent} ntfy notifications", flush=True)
        self._reply(200, f"sent {sent}\n".encode("utf-8"))

    def log_message(self, fmt, *args):
        """Write access-log lines to stdout, flushed immediately."""
        print(fmt % args, flush=True)
if __name__ == "__main__":
    # Listen on all interfaces, port 8080; each request is handled by Handler.
    listen_address = ("0.0.0.0", 8080)
    bridge_server = ThreadingHTTPServer(listen_address, Handler)
    print(f"alertmanager ntfy bridge listening on :8080 -> {NTFY_URL}", flush=True)
    bridge_server.serve_forever()
+25
View File
@@ -0,0 +1,25 @@
# Alertmanager configuration: all alerts go to the ntfy bridge webhook.
global:
  resolve_timeout: 5m

route:
  # Default route for every alert that no child route matches.
  receiver: ntfy-homelab
  group_by: [alertname, severity]
  group_wait: 20s
  group_interval: 5m
  repeat_interval: 4h
  routes:
    # Critical alerts use the same receiver with shorter timings.
    - receiver: ntfy-homelab
      matchers:
        - 'severity="critical"'
      group_wait: 10s
      group_interval: 2m
      repeat_interval: 30m

receivers:
  - name: ntfy-homelab
    webhook_configs:
      - url: http://alertmanager-ntfy-bridge:8080/alertmanager
        send_resolved: true
        max_alerts: 10
+40
View File
@@ -19,10 +19,49 @@ services:
security_opt:
- no-new-privileges:true
depends_on:
- alertmanager
- blackbox-exporter
- node-exporter
- cadvisor
# Alertmanager service: reads the mounted alertmanager.yml and keeps its
# state in the alertmanager_data volume. Port 9093 is exposed, not published.
alertmanager:
  image: prom/alertmanager:v0.28.1
  container_name: monitoring-alertmanager
  restart: unless-stopped
  command:
    - '--config.file=/etc/alertmanager/alertmanager.yml'
    - '--storage.path=/alertmanager'
  volumes:
    - './alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro'
    - 'alertmanager_data:/alertmanager'
  networks: [monitoring_net]
  expose: ['9093']
  security_opt: ['no-new-privileges:true']
# ntfy bridge service: runs the mounted bridge.py on a stock Python image.
# Port 8080 is exposed, not published.
alertmanager-ntfy-bridge:
  image: python:3.13-alpine
  container_name: monitoring-alertmanager-ntfy-bridge
  restart: unless-stopped
  # Explicit resolvers for this container.
  dns: ['1.1.1.1', '8.8.8.8']
  environment:
    # Topic URL read by bridge.py.
    NTFY_URL: 'https://ntfy.kaleschke.info/homelab-alerts'
  command: [python, /app/bridge.py]
  volumes:
    - './alertmanager-ntfy-bridge/bridge.py:/app/bridge.py:ro'
  networks: [monitoring_net]
  expose: ['8080']
  security_opt: ['no-new-privileges:true']
blackbox-exporter:
image: prom/blackbox-exporter:v0.27.0
container_name: monitoring-blackbox-exporter
@@ -311,6 +350,7 @@ networks:
volumes:
prometheus_data:
alertmanager_data:
loki_data:
promtail_positions:
grafana_data:
+6
View File
@@ -7,6 +7,12 @@ global:
rule_files:
- /etc/prometheus/alerts.yml
# Alertmanager instance that Prometheus sends alerts to.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']
scrape_configs:
- job_name: prometheus
static_configs: