bc9ace315a
Findings aus dem Backup-/Restore-Audit 2026-06-18 umgesetzt: - Dump-Frische als Prometheus-Metrik (homelab_borg_dump_present / homelab_borg_dump_age_seconds) im Host-Exporter; schliesst den Blindfleck, dass Borg weiterlaeuft und stale Dumps archiviert, ohne Job-Fehler. - Neue Alerts HomelabBorgDumpMissing / HomelabBorgDumpStale (critical) plus ALERT_RULES.md. - Freshness-Gate (.sh + .ps1) und H:-Nearline-Pull um n8n.sqlite.dump und postgresql17-globals.sql ergaenzt. - Critical-Container-Watch um mail-archiver, n8n, homeassistant, smarthome-mosquitto erweitert. - BACKUP_SCOPE: /mnt/user/projekte und sonstige User-Shares ausserhalb App-Scope als bewusste offene Operator-Entscheidung dokumentiert; Hermes-data-Pfad als geparkt klargestellt. - MASTER_TODO: Nearline-Pull-Ueberwachung, Host-Pull-Nachzug und projekte-Scope-Entscheidung aufgenommen. Enthaelt ausserdem die zuvor vorbereiteten Scope-Erweiterungen (nextcloud html+data, n8n, filebrowser, influxdb3) und Scope-Drift-/ Retention-/Compact-/Check-Alerts. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
234 lines
9.4 KiB
YAML
234 lines
9.4 KiB
YAML
groups:
# Availability alerts built on the blackbox HTTP probes (job="blackbox-http").
- name: homelab-availability
  rules:
  # Aggregate outage: five or more probed endpoints are failing at once.
  # count() is required here. The `== 0` comparison filters the vector down
  # to failing series, and every one of those samples has the value 0, so
  # sum() over them is always 0 and could never reach the threshold of 5.
  - alert: HomelabExternalConnectivityDown
    expr: count(probe_success{job="blackbox-http"} == 0) >= 5
    for: 8m
    labels:
      severity: warning
    annotations:
      summary: 'External connectivity appears down'
      description: >-
        At least 5 public homelab endpoints are unreachable.
        Likely WAN, DNS, or provider issue.

  # Per-endpoint outage. The `unless on()` clause drops every per-endpoint
  # result while the aggregate condition above holds (five or more failing
  # probes), so a mass outage yields the aggregate alert instead of one alert
  # per endpoint. The right-hand side must use the same count() expression as
  # HomelabExternalConnectivityDown, otherwise the suppression never matches.
  - alert: HomelabEndpointDown
    expr: >-
      (probe_success{job="blackbox-http"} == 0)
      unless on() (count(probe_success{job="blackbox-http"} == 0) >= 5)
    for: 8m
    labels:
      severity: critical
    annotations:
      summary: '{{ $labels.instance }} is not reachable'
      description: 'Blackbox probe failed for {{ $labels.instance }}.'

  # Probe duration has stayed above 5 seconds for 5 minutes.
  - alert: HomelabEndpointSlow
    expr: probe_duration_seconds{job="blackbox-http"} > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: '{{ $labels.instance }} is slow'
      description: 'Blackbox probe duration is above 5 seconds for {{ $labels.instance }}.'

  # Warning window: remaining certificate lifetime is below 21 days but still
  # above 7 days. At 7 days or less HomelabCertificateExpiresCritical takes
  # over, so the two certificate alerts do not overlap.
  - alert: HomelabCertificateExpiresSoon
    expr: >-
      (probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) < 21 * 24 * 3600
      and (probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) > 7 * 24 * 3600
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: 'TLS certificate expires soon for {{ $labels.instance }}'
      description: 'The earliest certificate expiry for {{ $labels.instance }} is below 21 days.'

  # Critical window: remaining certificate lifetime is 7 days or less. The
  # difference goes negative after expiry, so expired certificates match too.
  - alert: HomelabCertificateExpiresCritical
    expr: (probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) <= 7 * 24 * 3600
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: 'TLS certificate is close to expiry for {{ $labels.instance }}'
      description: >-
        The earliest certificate expiry for {{ $labels.instance }} is at or
        below 7 days, or already expired.

# Host-level resource alerts (filesystem usage, memory) plus Traefik 5xx counts.
- name: homelab-host
  rules:
  # Used space in percent, computed as 100 * (1 - avail / size).
  # Filesystems whose fstype matches tmpfs or overlay are excluded.
  - alert: HomelabDiskAlmostFull
    expr: >-
      100 * (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
      / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 85
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Disk usage high on {{ $labels.mountpoint }}'
      description: '{{ $labels.mountpoint }} is above 85% used.'

  # Same usage expression as HomelabDiskAlmostFull with a 95% threshold and a
  # shorter hold time. Anything above 95% also satisfies the 85% rule.
  - alert: HomelabDiskCritical
    expr: >-
      100 * (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
      / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 95
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Disk critically full on {{ $labels.mountpoint }}'
      description: >-
        {{ $labels.mountpoint }} is above 95% used.
        Writes may start to fail (DB, appdata, cache).

  # Memory in use, derived from MemAvailable relative to MemTotal, above 90%.
  - alert: HomelabHighMemoryUsage
    expr: 100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 90
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Memory usage high'
      description: 'Host memory usage is above 90%.'

  # Per-service increase of requests with a 5xx status code over a 5-minute
  # window; the alert condition is an increase of at least 5.
  - alert: HomelabTraefik5xx
    expr: sum(increase(traefik_service_requests_total{code=~"5.."}[5m])) by (service) >= 5
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 'Traefik 5xx responses for {{ $labels.service }}'
      description: >-
        Traefik reports at least 5 5xx responses for {{ $labels.service }}
        within 5 minutes.

# Backup health (Borg job state, scope drift, dump freshness, maintenance
# settings) and container state, all read from homelab_* metrics.
- name: homelab-backup-and-containers
  rules:
  # The exporter's own last-run timestamp is more than 2 hours old.
  - alert: HomelabTextfileExporterStale
    expr: time() - homelab_textfile_exporter_last_run_timestamp_seconds > 2 * 60 * 60
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: 'Homelab textfile metrics are stale'
      description: 'The host textfile exporter has not refreshed metrics for more than 2 hours.'

  # absent() yields a result only when no series of the metric exists at all.
  - alert: HomelabBorgMetricsMissing
    expr: absent(homelab_borg_last_completed_timestamp_seconds)
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: 'Borg backup metrics are missing'
      description: >-
        Prometheus cannot see the
        homelab_borg_last_completed_timestamp_seconds metric.

  # Last completed backup is older than 30 hours (30 * 60 * 60 seconds).
  - alert: HomelabBorgBackupStale
    expr: time() - homelab_borg_last_completed_timestamp_seconds > 30 * 60 * 60
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: 'Borg backup is stale'
      description: 'The latest completed Borg backup is older than 30 hours.'

  # Any value other than 1 counts as a failed job. The description reads the
  # `status` and `archive` labels of the series.
  - alert: HomelabBorgLastJobFailed
    expr: homelab_borg_last_success != 1
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: 'Latest Borg backup did not complete successfully'
      description: >-
        The latest Borg UI job status is {{ $labels.status }}
        for archive {{ $labels.archive }}.

  # Warning flag of the latest job is set to 1.
  - alert: HomelabBorgLastJobCompletedWithWarnings
    expr: homelab_borg_last_job_warning == 1
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: 'Latest Borg backup completed with warnings'
      description: >-
        The latest Borg UI job completed with warnings
        for archive {{ $labels.archive }}.

  # The expected-source-list presence flag is anything other than 1.
  - alert: HomelabBorgScopeSourceListMissing
    expr: homelab_borg_scope_expected_file_present != 1
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: 'Borg expected source list is not visible'
      description: 'Borg UI cannot see the repo source list used for drift checks.'

  # Scope drift: at least one expected source path is not configured.
  # {{ $value }} renders the number of missing paths.
  - alert: HomelabBorgScopeMissingSources
    expr: homelab_borg_scope_missing_sources_total > 0
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: 'Borg UI is missing expected backup sources'
      description: >-
        Borg UI is missing {{ $value }} source path(s)
        from ops/borg-ui/all-important-sources.txt.

  # Scope drift in the other direction: configured paths that are not
  # tracked in the source list. Lower severity and a longer hold time.
  - alert: HomelabBorgScopeExtraSources
    expr: homelab_borg_scope_extra_sources_total > 0
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: 'Borg UI has sources not tracked in the repo'
      description: >-
        Borg UI has {{ $value }} source path(s) that are not listed
        in ops/borg-ui/all-important-sources.txt.

  # Presence flag of a dump artifact is 0; the `dump` label names the artifact.
  - alert: HomelabBorgDumpMissing
    expr: homelab_borg_dump_present == 0
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: 'Borg pre-backup dump is missing: {{ $labels.dump }}'
      description: >-
        Expected dump artifact {{ $labels.dump }} is not present in the latest
        dump set. The pre-backup dump job may have failed or stopped.

  # Reported age of a dump artifact exceeds 30 hours (30 * 60 * 60 seconds).
  - alert: HomelabBorgDumpStale
    expr: homelab_borg_dump_age_seconds > 30 * 60 * 60
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: 'Borg pre-backup dump is stale: {{ $labels.dump }}'
      description: >-
        Dump artifact {{ $labels.dump }} is older than 30 hours.
        pre-backup-dumps.sh may have stopped; Borg would keep archiving stale
        database content without a job failure.

  # Last recorded repository check is more than 14 days old.
  - alert: HomelabBorgRepositoryCheckStale
    expr: time() - homelab_borg_repository_last_check_timestamp_seconds > 14 * 24 * 60 * 60
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: 'Borg repository check is stale'
      description: >-
        Borg repository {{ $labels.repository }} has not had a recorded check
        for more than 14 days.

  # Prune-after-backup flag of a schedule is anything other than 1.
  - alert: HomelabBorgRetentionDisabled
    expr: homelab_borg_schedule_prune_after_enabled != 1
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: 'Borg retention pruning is disabled'
      description: 'Scheduled Borg job {{ $labels.schedule }} does not run prune after backup.'

  # Compact-after-backup flag of a schedule is anything other than 1.
  - alert: HomelabBorgCompactDisabled
    expr: homelab_borg_schedule_compact_after_enabled != 1
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: 'Borg compaction is disabled'
      description: 'Scheduled Borg job {{ $labels.schedule }} does not run compact after backup.'

  # Running flag of a watched container is 0; the `name` label identifies it.
  - alert: HomelabCriticalContainerDown
    expr: homelab_critical_container_running == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Critical container is down: {{ $labels.name }}'
      description: >-
        The host textfile exporter reports that critical container
        {{ $labels.name }} is not running.

  # Image-match flag of a container is 0. The description reads the `name`
  # and `project` labels of the series.
  - alert: HomelabGitOpsRuntimeImageDrift
    expr: homelab_gitops_runtime_image_match == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: 'Runtime image drift: {{ $labels.name }}'
      description: >-
        Container {{ $labels.name }} is not running the image declared by its
        Compose config in project {{ $labels.project }}.

# Monitoring of the monitoring: scrape-target health.
- name: homelab-meta
  rules:
  # `up` is 0 for a target whose scrape failed. The other alert rules evaluate
  # metrics that scraping delivers, so a dead target hides its own failures.
  - alert: HomelabPrometheusTargetDown
    expr: up == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Prometheus target down: {{ $labels.job }} / {{ $labels.instance }}'
      description: >-
        Scrape target {{ $labels.instance }} (job {{ $labels.job }}) is
        unreachable. Metrics from this target are silent — alerts built on
        them will not fire.