---
# Prometheus alerting rules for the homelab.
groups:
# Reachability, latency and TLS-expiry alerts built on the probe_* series of
# the "blackbox-http" scrape job.
- name: homelab-availability
  rules:
  # Aggregate outage: at least 5 probed endpoints are failing simultaneously.
  # count(), not sum(): the `== 0` comparison is a filter that keeps only
  # samples whose value is 0, so sum() over them is always 0 and the
  # threshold could never be reached.
  - alert: HomelabExternalConnectivityDown
    expr: count(probe_success{job="blackbox-http"} == 0) >= 5
    for: 8m
    labels:
      severity: warning
    annotations:
      summary: "External connectivity appears down"
      description: "At least 5 public homelab endpoints are unreachable. Likely WAN, DNS, or provider issue."

  # Per-endpoint outage. The `unless on()` clause drops every per-endpoint
  # result while the aggregate condition above holds, so a broad outage
  # produces one aggregate alert instead of one alert per endpoint.
  # The right-hand side must stay in sync with HomelabExternalConnectivityDown.
  - alert: HomelabEndpointDown
    expr: >-
      (probe_success{job="blackbox-http"} == 0)
      unless on()
      (count(probe_success{job="blackbox-http"} == 0) >= 5)
    for: 8m
    labels:
      severity: critical
    annotations:
      summary: "{{ $labels.instance }} is not reachable"
      description: "Blackbox probe failed for {{ $labels.instance }}."

  # Probe duration above 5 seconds, sustained for 5 minutes.
  - alert: HomelabEndpointSlow
    expr: probe_duration_seconds{job="blackbox-http"} > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }} is slow"
      description: "Blackbox probe duration is above 5 seconds for {{ $labels.instance }}."

  # Warning band: remaining certificate lifetime is below 21 days but still
  # above 7 days. The lower bound hands over to the critical rule below so
  # the two alerts do not fire for the same certificate at the same time.
  - alert: HomelabCertificateExpiresSoon
    expr: >-
      (probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) < 21 * 24 * 3600
      and
      (probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) > 7 * 24 * 3600
    for: 30m
    labels:
      severity: warning
    annotations:
      summary: "TLS certificate expires soon for {{ $labels.instance }}"
      description: "The earliest certificate expiry for {{ $labels.instance }} is below 21 days."

  # Critical band: 7 days or less remaining. Negative values (already
  # expired) also satisfy the comparison.
  - alert: HomelabCertificateExpiresCritical
    expr: (probe_ssl_earliest_cert_expiry{job="blackbox-http"} - time()) <= 7 * 24 * 3600
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: "TLS certificate is close to expiry for {{ $labels.instance }}"
      description: "The earliest certificate expiry for {{ $labels.instance }} is at or below 7 days, or already expired."
# Filesystem and memory thresholds on node_* series, plus Traefik 5xx counts.
- name: homelab-host
  rules:
  # Used space above 85% for 10 minutes; tmpfs and overlay filesystems are
  # excluded on both sides of the division.
  - alert: HomelabDiskAlmostFull
    expr: >-
      100 * (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
      / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 85
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Disk usage high on {{ $labels.mountpoint }}"
      description: "{{ $labels.mountpoint }} is above 85% used."

  # Same ratio as HomelabDiskAlmostFull with a 95% threshold and a shorter
  # hold time.
  - alert: HomelabDiskCritical
    expr: >-
      100 * (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
      / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 95
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Disk critically full on {{ $labels.mountpoint }}"
      description: "{{ $labels.mountpoint }} is above 95% used. Writes may start to fail (DB, appdata, cache)."

  # Share of memory that is not "available", above 90% for 10 minutes.
  - alert: HomelabHighMemoryUsage
    expr: >-
      100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)
      > 90
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Memory usage high"
      description: "Host memory usage is above 90%."

  # At least 5 responses with a 5xx code per Traefik service within a
  # 5-minute window.
  - alert: HomelabTraefik5xx
    expr: >-
      sum(increase(traefik_service_requests_total{code=~"5.."}[5m])) by (service)
      >= 5
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Traefik 5xx responses for {{ $labels.service }}"
      description: "Traefik reports at least 5 5xx responses for {{ $labels.service }} within 5 minutes."

# Alerts on the homelab_* series (textfile exporter freshness, Borg backups,
# critical containers, GitOps image drift).
- name: homelab-backup-and-containers
  rules:
  # The exporter's last-run timestamp is more than 2 hours old.
  - alert: HomelabTextfileExporterStale
    expr: >-
      time() - homelab_textfile_exporter_last_run_timestamp_seconds
      > 2 * 60 * 60
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: "Homelab textfile metrics are stale"
      description: "The host textfile exporter has not refreshed metrics for more than 2 hours."

  # absent() yields a result only when no series of this name exists, which
  # the staleness rule below cannot detect on its own.
  - alert: HomelabBorgMetricsMissing
    expr: absent(homelab_borg_last_completed_timestamp_seconds)
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: "Borg backup metrics are missing"
      description: "Prometheus cannot see the homelab_borg_last_completed_timestamp_seconds metric."

  # The last completed backup is older than 30 hours.
  - alert: HomelabBorgBackupStale
    expr: >-
      time() - homelab_borg_last_completed_timestamp_seconds
      > 30 * 60 * 60
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: "Borg backup is stale"
      description: "The latest completed Borg backup is older than 30 hours."

  # Any value other than 1 counts as a failed last job.
  - alert: HomelabBorgLastJobFailed
    expr: homelab_borg_last_success != 1
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: "Latest Borg backup did not complete successfully"
      description: "The latest Borg UI job status is {{ $labels.status }} for archive {{ $labels.archive }}."

  # The warning flag of the last job is set.
  - alert: HomelabBorgLastJobCompletedWithWarnings
    expr: homelab_borg_last_job_warning == 1
    for: 15m
    labels:
      severity: warning
    annotations:
      summary: "Latest Borg backup completed with warnings"
      description: "The latest Borg UI job completed with warnings for archive {{ $labels.archive }}."

  # A container reported with running == 0 for 5 minutes.
  - alert: HomelabCriticalContainerDown
    expr: homelab_critical_container_running == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Critical container is down: {{ $labels.name }}"
      description: "The host textfile exporter reports that critical container {{ $labels.name }} is not running."

  # The image-match flag is 0 for a container, sustained for 10 minutes.
  - alert: HomelabGitOpsRuntimeImageDrift
    expr: homelab_gitops_runtime_image_match == 0
    for: 10m
    labels:
      severity: warning
    annotations:
      summary: "Runtime image drift: {{ $labels.name }}"
      description: "Container {{ $labels.name }} is not running the image declared by its Compose config in project {{ $labels.project }}."

# Monitoring of the monitoring: any scrape target whose `up` series is 0.
- name: homelab-meta
  rules:
  - alert: HomelabPrometheusTargetDown
    expr: up == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Prometheus target down: {{ $labels.job }} / {{ $labels.instance }}"
      description: "Scrape target {{ $labels.instance }} (job {{ $labels.job }}) is unreachable. Metrics from this target are silent — alerts built on them will not fire."