Tune homelab availability alerts
This commit is contained in:
@@ -1,9 +1,18 @@
|
||||
groups:
  - name: homelab-availability
    rules:
      # Group-level outage: fires when five or more job="blackbox-http"
      # probes report probe_success == 0 at once, sustained for 8 minutes.
      #
      # count() is required here, not sum(): the `== 0` filter keeps only
      # series whose sample value is 0, so summing them always yields 0 and
      # a `>= 5` comparison on that sum could never match.
      - alert: HomelabExternalConnectivityDown
        expr: count(probe_success{job="blackbox-http"} == 0) >= 5
        for: 8m
        labels:
          severity: warning
        annotations:
          summary: "External connectivity appears down"
          description: "At least 5 public homelab endpoints are unreachable. Likely WAN, DNS, or provider issue."

      # Per-endpoint outage: one alert per failing probe series. The
      # `unless on()` clause drops every per-endpoint result while five or
      # more probes are failing, so a broad outage is reported once by
      # HomelabExternalConnectivityDown instead of once per endpoint.
      # Supersedes the earlier rule shown in this diff:
      #   expr: probe_success{job="blackbox-http"} == 0
      #   for: 2m
      - alert: HomelabEndpointDown
        expr: (probe_success{job="blackbox-http"} == 0) unless on() (count(probe_success{job="blackbox-http"} == 0) >= 5)
        for: 8m
        labels:
          severity: critical
        # The annotation entries lie outside the visible diff hunk.
        annotations:
|
||||
@@ -40,10 +49,10 @@ groups:
|
||||
description: "Host memory usage is above 90%."
|
||||
|
||||
# Warns when a Traefik service accumulates at least five 5xx responses over a
# 5-minute window and that condition holds for 2 minutes.
# Supersedes the earlier threshold and wording shown in this diff:
#   expr: sum(rate(traefik_service_requests_total{code=~"5.."}[5m])) by (service) > 0
#   description: "Traefik reports 5xx responses for {{ $labels.service }}."
- alert: HomelabTraefik5xx
  expr: sum(increase(traefik_service_requests_total{code=~"5.."}[5m])) by (service) >= 5
  for: 2m
  labels:
    severity: warning
  annotations:
    summary: "Traefik 5xx responses for {{ $labels.service }}"
    description: "Traefik reports at least 5 5xx responses for {{ $labels.service }} within 5 minutes."
|
||||
|
||||
Reference in New Issue
Block a user