Prepare monitoring alert rules
This commit is contained in:
@@ -62,6 +62,7 @@ INFLUXDB_BIND_IP=192.168.178.58
|
||||
- `https://monitoring.kaleschke.info` leitet zu Authelia.
|
||||
- Die Grafana-Datasources `Prometheus`, `Loki` und `InfluxDB 3 Core` lassen sich erfolgreich testen.
|
||||
- Prometheus Targets: `prometheus`, `node-exporter`, `cadvisor`, `traefik`, `blackbox-http`.
|
||||
- Node Exporter Textfile Collector: `/mnt/user/services/posture-check/textfile/homelab.prom` wird vom Host-Skript `services/posture-check/export-prometheus-textfile.sh` befuellt.
|
||||
- Alertmanager ist erreichbar und sendet ueber `monitoring-alertmanager-ntfy-bridge` nach `https://ntfy.kaleschke.info/homelab-alerts`.
|
||||
- Loki zeigt Container-Logs mit Labels `container`, `compose_project`, `compose_service`.
|
||||
- InfluxDB 3 Core enthaelt die Datenbank `homelab`.
|
||||
@@ -83,9 +84,17 @@ Blackbox-HTTP-Alerts unterscheiden zwischen einem einzelnen kaputten Endpoint un
|
||||
|
||||
- `HomelabExternalConnectivityDown` feuert, wenn mindestens 5 Public-Endpoints gleichzeitig fuer 8 Minuten nicht erreichbar sind. Das deckt WAN-, DNS- oder Provider-Ausfaelle ab, inklusive laengerer DSL-Reconnects.
|
||||
- `HomelabEndpointDown` feuert fuer einzelne Endpoints erst nach 8 Minuten und wird unterdrueckt, solange der Sammelalert aktiv ist. Dadurch erzeugt ein Telekom-24h-Reconnect keine ntfy-Flut pro Domain.
|
||||
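- `HomelabCertificateExpiresSoon` (Warnung bei weniger als 21 Tagen Restlaufzeit) und `HomelabCertificateExpiresCritical` (kritisch bei hoechstens 7 Tagen Restlaufzeit oder bereits abgelaufenem Zertifikat) nutzen die Blackbox-TLS-Metrik `probe_ssl_earliest_cert_expiry`.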
|
||||
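- `HomelabBorgBackupStale`, `HomelabBorgLastJobFailed`, `HomelabBorgLastJobCompletedWithWarnings` und `HomelabCriticalContainerDown` nutzen Host-Textfile-Metriken. Voraussetzung: `services/posture-check/export-prometheus-textfile.sh` laeuft regelmaessig auf dem Host, empfohlen alle 15 Minuten. Bleibt ein Lauf des Skripts laenger als 2 Stunden aus, feuert `HomelabTextfileExporterStale`; fehlt die Borg-Metrik `homelab_borg_last_completed_timestamp_seconds` ganz, feuert `HomelabBorgMetricsMissing`.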
|
||||
|
||||
Healthcheck der Alertmanager-ntfy-Bridge testen:
|
||||
|
||||
```bash
|
||||
curl -fsS http://alertmanager-ntfy-bridge:8080/healthz
|
||||
```
|
||||
|
||||
Textfile-Metriken manuell aktualisieren:
|
||||
|
||||
```bash
|
||||
bash /mnt/user/services/homelab-infra/services/posture-check/export-prometheus-textfile.sh
|
||||
```
|
||||
|
||||
@@ -280,11 +280,13 @@ services:
|
||||
- --path.procfs=/host/proc
|
||||
- --path.sysfs=/host/sys
|
||||
- --path.rootfs=/rootfs
|
||||
- --collector.textfile.directory=/textfile
|
||||
- --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/docker/.+|var/lib/containers/storage/.+)($|/)
|
||||
volumes:
|
||||
- /proc:/host/proc:ro
|
||||
- /sys:/host/sys:ro
|
||||
- /:/rootfs:ro
|
||||
- /mnt/user/services/posture-check/textfile:/textfile:ro
|
||||
networks:
|
||||
- monitoring_net
|
||||
expose:
|
||||
|
||||
@@ -28,6 +28,24 @@ groups:
|
||||
summary: "{{ $labels.instance }} is slow"
|
||||
description: "Blackbox probe duration is above 5 seconds for {{ $labels.instance }}."
|
||||
|
||||
- alert: HomelabCertificateExpiresSoon
|
||||
expr: (probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) < 21 * 24 * 3600 and (probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) > 7 * 24 * 3600
|
||||
for: 30m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "TLS certificate expires soon for {{ $labels.instance }}"
|
||||
description: "The earliest certificate expiry for {{ $labels.instance }} is below 21 days."
|
||||
|
||||
- alert: HomelabCertificateExpiresCritical
|
||||
expr: (probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) <= 7 * 24 * 3600
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "TLS certificate is close to expiry for {{ $labels.instance }}"
|
||||
description: "The earliest certificate expiry for {{ $labels.instance }} is at or below 7 days, or already expired."
|
||||
|
||||
- name: homelab-host
|
||||
rules:
|
||||
- alert: HomelabDiskAlmostFull
|
||||
@@ -56,3 +74,59 @@ groups:
|
||||
annotations:
|
||||
summary: "Traefik 5xx responses for {{ $labels.service }}"
|
||||
description: "Traefik reports at least 5 5xx responses for {{ $labels.service }} within 5 minutes."
|
||||
|
||||
- name: homelab-backup-and-containers
|
||||
rules:
|
||||
- alert: HomelabTextfileExporterStale
|
||||
expr: time() - homelab_textfile_exporter_last_run_timestamp_seconds > 2 * 60 * 60
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Homelab textfile metrics are stale"
|
||||
description: "The host textfile exporter has not refreshed metrics for more than 2 hours."
|
||||
|
||||
- alert: HomelabBorgMetricsMissing
|
||||
expr: absent(homelab_borg_last_completed_timestamp_seconds)
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Borg backup metrics are missing"
|
||||
description: "Prometheus cannot see the homelab_borg_last_completed_timestamp_seconds metric."
|
||||
|
||||
- alert: HomelabBorgBackupStale
|
||||
expr: time() - homelab_borg_last_completed_timestamp_seconds > 30 * 60 * 60
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Borg backup is stale"
|
||||
description: "The latest completed Borg backup is older than 30 hours."
|
||||
|
||||
- alert: HomelabBorgLastJobFailed
|
||||
expr: homelab_borg_last_success != 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Latest Borg backup did not complete successfully"
|
||||
description: "The latest Borg UI job status is {{ $labels.status }} for archive {{ $labels.archive }}."
|
||||
|
||||
- alert: HomelabBorgLastJobCompletedWithWarnings
|
||||
expr: homelab_borg_last_job_warning == 1
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "Latest Borg backup completed with warnings"
|
||||
description: "The latest Borg UI job completed with warnings for archive {{ $labels.archive }}."
|
||||
|
||||
- alert: HomelabCriticalContainerDown
|
||||
expr: homelab_critical_container_running == 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Critical container is down: {{ $labels.name }}"
|
||||
description: "The host textfile exporter reports that critical container {{ $labels.name }} is not running."
|
||||
|
||||
Reference in New Issue
Block a user