Add monitoring replacement baseline
This commit is contained in:
@@ -0,0 +1,49 @@
|
||||
groups:
|
||||
# Availability alerts built on the blackbox HTTP probe job
# (every selector below filters on job="blackbox-http").
- name: homelab-availability
  rules:
  # A probe ran and reported failure for the target URL for 2 minutes.
  - alert: HomelabEndpointDown
    expr: probe_success{job="blackbox-http"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "{{ $labels.instance }} is not reachable"
      description: "Blackbox probe failed for {{ $labels.instance }}."

  # The probe scrape itself failed, so no probe_success sample exists and
  # HomelabEndpointDown cannot fire. Without this rule an exporter outage
  # would silently disable all availability alerting.
  - alert: HomelabProbeScrapeFailed
    expr: up{job="blackbox-http"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Probe scrape failed for {{ $labels.instance }}"
      description: "Prometheus could not scrape the blackbox probe for {{ $labels.instance }}; availability is unknown."

  # The probe has taken longer than 5 seconds for 5 minutes.
  - alert: HomelabEndpointSlow
    expr: probe_duration_seconds{job="blackbox-http"} > 5
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "{{ $labels.instance }} is slow"
      description: "Blackbox probe duration is above 5 seconds for {{ $labels.instance }}."
|
||||
|
||||
# Alerts on node_filesystem_*, node_memory_* and
# traefik_service_requests_total metrics.
- name: homelab-host
  rules:
  # Filesystem usage above 85% for 10 minutes; tmpfs and overlay filesystems
  # are excluded by the fstype matcher on both sides of the division.
  - alert: HomelabDiskAlmostFull
    expr: >-
      100 * (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"}
      / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 85
    for: 10m
    labels:
      severity: warning
    annotations:
      # Name the host as well as the mountpoint so the notification text is
      # unambiguous when more than one host is scraped.
      summary: "Disk usage high on {{ $labels.instance }} {{ $labels.mountpoint }}"
      description: "{{ $labels.mountpoint }} on {{ $labels.instance }} is above 85% used."

  # Memory usage (1 - MemAvailable / MemTotal) above 90% for 10 minutes.
  - alert: HomelabHighMemoryUsage
    expr: 100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 90
    for: 10m
    labels:
      severity: warning
    annotations:
      # Identify the affected host, like the other alerts in this file do
      # for their subject.
      summary: "Memory usage high on {{ $labels.instance }}"
      description: "Memory usage on {{ $labels.instance }} is above 90%."

  # Any non-zero 5xx request rate per Traefik service, sustained for 2 minutes.
  # The sum keeps only the "service" label, so only that label is templated.
  - alert: HomelabTraefik5xx
    expr: sum(rate(traefik_service_requests_total{code=~"5.."}[5m])) by (service) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "Traefik 5xx responses for {{ $labels.service }}"
      description: "Traefik reports 5xx responses for {{ $labels.service }}."
|
||||
@@ -4,6 +4,9 @@ global:
|
||||
external_labels:
|
||||
site: kallilabcore
|
||||
|
||||
# Rule files loaded by this Prometheus instance.
rule_files:
- /etc/prometheus/alerts.yml
|
||||
|
||||
scrape_configs:
|
||||
- job_name: prometheus
|
||||
static_configs:
|
||||
@@ -26,3 +29,40 @@ scrape_configs:
|
||||
# Traefik exposes Prometheus metrics internally on its metrics entrypoint.
|
||||
- targets:
|
||||
- traefik:8082
|
||||
|
||||
# HTTP probes through the blackbox exporter: each listed URL is handed to
# blackbox-exporter:9115/probe with the http_2xx module.
- job_name: blackbox-http
  metrics_path: /probe
  params:
    module: [http_2xx]
  static_configs:
  - targets:
    - https://monitoring.kaleschke.info
    - https://auth.kaleschke.info
    - https://git.kaleschke.info
    - https://komodo.kaleschke.info
    - https://uptime.kaleschke.info
    - https://home.kaleschke.info
    - https://paperless.kaleschke.info
    - https://paperless-gpt.kaleschke.info
    - https://immich.kaleschke.info
    - https://mealie.kaleschke.info
    - https://vault.kaleschke.info
    - https://cloud.kaleschke.info
    - https://ntfy.kaleschke.info
    - https://borg.kaleschke.info
    - https://files.kaleschke.info
    - https://code.kaleschke.info
    - https://glances.kaleschke.info
    - https://scrutiny.kaleschke.info
    - https://speedtest.kaleschke.info
    - https://pdf.kaleschke.info
  relabel_configs:
  # Copy the configured URL into the "target" URL parameter of the scrape.
  - source_labels: [__address__]
    target_label: __param_target
  # Expose the probed URL as the instance label on the resulting series.
  - source_labels: [__param_target]
    target_label: instance
  # Point the actual scrape at the exporter instead of the probed URL.
  - target_label: __address__
    replacement: blackbox-exporter:9115
|
||||
|
||||
Reference in New Issue
Block a user