diff --git a/docs/REPO_MAP.md b/docs/REPO_MAP.md index 1f9b8d1..155e63d 100644 --- a/docs/REPO_MAP.md +++ b/docs/REPO_MAP.md @@ -109,7 +109,7 @@ Secret-Werte werden hier nicht dokumentiert. Aufgefuehrt werden nur Variablennam | Glances | `ops/glances/docker-compose.yml` | `glances` -> `nicolargo/glances:latest-full@sha256:...` | `glances.kaleschke.info` | `frontend_net` | keine | Rootfs/Docker-Socket fuer Monitoring | | Grafana/InfluxDB | `ops/grafana-influxdb/docker-compose.yml` | `grafana`, `influxdb3-core` | `grafana.kaleschke.info` | `frontend_net`, `grafana_influx_internal`, `grafana_influx_lan` | `influxdb3-core`: `${INFLUXDB_BIND_IP:-127.0.0.1}:8181:8181` | abgeloester Altstand; nach erfolgreicher Migration durch `monitoring/` ersetzen | | Loki/Alloy | `ops/loki/docker-compose.yml` | `loki`, `alloy` | keine | `backend_net` | keine | abgeloester Altstand; nach erfolgreicher Migration durch `monitoring-loki`/`monitoring-promtail` ersetzen | -| Monitoring | `monitoring/docker-compose.yml` | `monitoring-prometheus`, `monitoring-loki`, `monitoring-promtail`, `monitoring-grafana`, `monitoring-node-exporter`, `monitoring-cadvisor`, `monitoring-influxdb3-core`, optional `monitoring-grafana-dashboard-importer` | `monitoring.kaleschke.info` | `frontend_net`, `monitoring_net`, `monitoring_influx_lan` | `monitoring-influxdb3-core`: `${INFLUXDB_BIND_IP:-127.0.0.1}:8181:8181` | zentraler Zielstack fuer Prometheus/Loki/Grafana/InfluxDB; Promtail nutzt Docker socket read-only; Dashboard-Importer nur via `bootstrap`-Profil | +| Monitoring | `monitoring/docker-compose.yml` | `monitoring-prometheus`, `monitoring-blackbox-exporter`, `monitoring-loki`, `monitoring-promtail`, `monitoring-grafana`, `monitoring-node-exporter`, `monitoring-cadvisor`, `monitoring-influxdb3-core`, optional `monitoring-grafana-dashboard-importer` | `monitoring.kaleschke.info` | `frontend_net`, `monitoring_net`, `monitoring_influx_lan` | `monitoring-influxdb3-core`: 
`${INFLUXDB_BIND_IP:-127.0.0.1}:8181:8181` | zentraler Zielstack fuer Prometheus/Loki/Grafana/InfluxDB; Blackbox ersetzt Uptime-Kuma-Checks nach Parallelphase; Promtail nutzt Docker socket read-only; Dashboard-Importer nur via `bootstrap`-Profil | | Hermes Agent | `ops/hermes-agent/docker-compose.yml` | `hermes-gateway`, `hermes-dashboard` -> local build from Dockerfile | `hermes.kaleschke.info` via `${HERMES_DASHBOARD_HOST}` | `hermes_net`, dashboard zusaetzlich `frontend_net` | `8642` nur expose intern | SSH runner, Home Assistant optional, LLM provider env; Dashboard hinter Authelia | | Komodo | `ops/komodo/docker-compose.yml` | `komodo-core`, `komodo-mongo`, `komodo-periphery` | `komodo.kaleschke.info` | `frontend_net`, `komodo_net` | keine | Mongo, Docker socket, `/mnt/user/services` workspace mount, Gitea DNS override | | Scrutiny | `ops/scrutiny/docker-compose.yml` | `scrutiny` -> `ghcr.io/starosdev/scrutiny:latest-omnibus@sha256:...` | `scrutiny.kaleschke.info` | `frontend_net` | keine | `privileged: true`, device mounts fuer SMART | @@ -198,7 +198,7 @@ Secret-Werte werden hier nicht dokumentiert. 
Aufgefuehrt werden nur Variablennam | Uptime Kuma | `/mnt/user/appdata/uptime-kuma` | | Grafana/InfluxDB | `/mnt/user/appdata/grafana`, Grafana provisioning, `/mnt/user/appdata/influxdb3/data`, `/mnt/user/appdata/influxdb3/plugins` | | Loki/Alloy | `/mnt/user/appdata/loki/config`, `/mnt/user/appdata/loki/data`, `/mnt/user/appdata/alloy/config`, `/mnt/user/appdata/alloy/data` | -| Monitoring | named volumes `prometheus_data`, `loki_data`, `promtail_positions`, `grafana_data`; InfluxDB-Persistenz unter `/mnt/user/appdata/influxdb3/data` und `/mnt/user/appdata/influxdb3/plugins`; Provisioning im Repo unter `monitoring/grafana/provisioning` | +| Monitoring | named volumes `prometheus_data`, `loki_data`, `promtail_positions`, `grafana_data`; InfluxDB-Persistenz unter `/mnt/user/appdata/influxdb3/data` und `/mnt/user/appdata/influxdb3/plugins`; Provisioning im Repo unter `monitoring/grafana/provisioning`; Dashboards unter `monitoring/grafana/dashboards` | | Hermes Agent | `/mnt/user/appdata/hermes-agent/data`, `/mnt/user/appdata/hermes-agent/ssh`, SSH private key path | | Komodo | `komodo_keys`, `/mnt/user/appdata/komodo/core`, `/mnt/user/appdata/komodo/mongo`, `/mnt/user/appdata/komodo/periphery`, `/mnt/user/services` | diff --git a/docs/SERVICE_CATALOG.md b/docs/SERVICE_CATALOG.md index 8d1c782..d91891f 100644 --- a/docs/SERVICE_CATALOG.md +++ b/docs/SERVICE_CATALOG.md @@ -67,8 +67,9 @@ Secret-Werte sind nicht enthalten. 
Es werden nur Secret-Namen, Env-Key-Namen und | `code-server` | Web-Editor / Operations Workspace | `ops/code-server/docker-compose.yml` | `https://code.kaleschke.info` | Traefik + Authelia | `/mnt/user/appdata/code-server`, `/mnt/user/services/dev` | Tier 3 | ja + Authelia | Passwort ueber LSIO `FILE__PASSWORD`; Workspaces beachten | | `grafana` | abgeloester Altstand fuer Grafana/InfluxDB | `ops/grafana-influxdb/docker-compose.yml` | `https://grafana.kaleschke.info` | Traefik + Authelia, InfluxDB 3 Core | `/mnt/user/appdata/grafana`, Grafana provisioning | Tier 3, `grafana.sqlite` | ja + Authelia | Nicht parallel zum neuen `monitoring/`-Zielstack betreiben; bleibt vorerst als Rollback-/Migrationsreferenz | | `influxdb3-core` | abgeloester Altstand fuer Home-Assistant-Langzeitdaten | `ops/grafana-influxdb/docker-compose.yml` | LAN `8181` je `INFLUXDB_BIND_IP`, keine Public URL | Grafana, Home Assistant Writer | `/mnt/user/appdata/influxdb3/data`, `/mnt/user/appdata/influxdb3/plugins` | Tier 3 | nein | Nach erfolgreicher Migration durch `monitoring-influxdb3-core` ersetzen; alten Datenpfad nicht blind loeschen | -| `monitoring-grafana` | zentrale Observability-UI fuer Metriken, Logs und InfluxDB | `monitoring/docker-compose.yml` | `https://monitoring.kaleschke.info` | Traefik + Authelia, Prometheus, Loki, InfluxDB 3 Core | named volume `grafana_data`, Provisioning unter `monitoring/grafana/provisioning` | Tier 3, named volume | ja + Authelia | Admin-Passwort ueber `monitoring_grafana_admin_password.txt`; Dashboard-Importer ist optionales `bootstrap`-Profil | -| `monitoring-prometheus` | Metrik-Speicher fuer Homelab-Monitoring | `monitoring/docker-compose.yml`, `monitoring/prometheus/prometheus.yml` | intern `http://prometheus:9090` | `monitoring_net`, node-exporter, cAdvisor, Traefik-Metrics | named volume `prometheus_data` | Tier 3, transiente Metriken mit 30 Tagen Retention | nein | Scrapes: Prometheus, node-exporter, cAdvisor, Traefik `:8082` | +| 
`monitoring-grafana` | zentrale Observability-UI fuer Metriken, Logs und InfluxDB | `monitoring/docker-compose.yml` | `https://monitoring.kaleschke.info` | Traefik + Authelia, Prometheus, Loki, InfluxDB 3 Core | named volume `grafana_data`, Provisioning unter `monitoring/grafana/provisioning`, Dashboards unter `monitoring/grafana/dashboards` | Tier 3, named volume | ja + Authelia | Admin-Passwort ueber `monitoring_grafana_admin_password.txt`; Dashboards `Homelab / Availability`, `Homelab / Host Overview`, `Homelab / Containers + Logs`; Dashboard-Importer ist optionales `bootstrap`-Profil | +| `monitoring-prometheus` | Metrik-Speicher fuer Homelab-Monitoring | `monitoring/docker-compose.yml`, `monitoring/prometheus/prometheus.yml`, `monitoring/prometheus/alerts.yml` | intern `http://prometheus:9090` | `monitoring_net`, node-exporter, cAdvisor, Traefik-Metrics, Blackbox Exporter | named volume `prometheus_data` | Tier 3, transiente Metriken mit 30 Tagen Retention | nein | Scrapes: Prometheus, node-exporter, cAdvisor, Traefik `:8082`, `blackbox-http`; Alerts werden in Prometheus ausgewertet, Grafana-Contact-Point folgt separat | +| `monitoring-blackbox-exporter` | HTTP-Erreichbarkeitspruefungen fuer Uptime-Kuma-Abloesung | `monitoring/docker-compose.yml`, `monitoring/blackbox/blackbox.yml` | intern `:9115` | Prometheus, externe HTTPS-Ziele | kein kritischer Zustand | rebuildbar | nein | Uptime Kuma erst nach sieben Tagen Parallelbetrieb und Grafana-Alerting-Paritaet stoppen | | `monitoring-loki` | Logspeicher fuer Monitoring-Stack | `monitoring/docker-compose.yml`, `monitoring/loki/loki-config.yml` | intern `http://loki:3100` | `monitoring_net`, Promtail, Grafana | named volume `loki_data` | Tier 3, transiente Logs mit 30 Tagen Retention | nein | Von bestehendem `ops/loki` getrennt; Doppelbetrieb bewusst pruefen | | `monitoring-promtail` | Docker-Log-Collector fuer Monitoring-Loki | `monitoring/docker-compose.yml`, `monitoring/promtail/promtail-config.yml` | intern | 
Docker socket read-only, Docker json-file Logs, Loki | named volume `promtail_positions` | rebuildbar | nein | Dokumentierte Host-Observability-Ausnahme: `/var/run/docker.sock:/var/run/docker.sock:ro` und `/var/lib/docker/containers:ro`; keine Appdaten, nur Log-Discovery | | `monitoring-node-exporter` | Host-Metriken fuer Prometheus | `monitoring/docker-compose.yml` | intern `:9100` | Host `/proc`, `/sys`, `/` read-only, Prometheus | kein kritischer Zustand | rebuildbar | nein | Host-Observability-Ausnahme mit read-only Rootfs/Proc/Sys-Mounts | diff --git a/monitoring/README.md b/monitoring/README.md index b71239d..b90b1a7 100644 --- a/monitoring/README.md +++ b/monitoring/README.md @@ -10,6 +10,7 @@ Zielzustand: ein zentraler Observability-Stack fuer KalliLab CORE. - `monitoring-promtail`: Docker-Log-Discovery ueber read-only Docker-Socket - `monitoring-node-exporter`: Host-Metriken - `monitoring-cadvisor`: Container-Metriken +- `monitoring-blackbox-exporter`: externe HTTP-Erreichbarkeit als Uptime-Kuma-Abloesepfad - `monitoring-influxdb3-core`: InfluxDB 3 Core fuer Home-Assistant-/Ecowitt-Langzeitdaten Die alten Pfade `ops/loki` und `ops/grafana-influxdb` sind damit abgeloeste Altstaende. Sie bleiben vorerst im Repo als Rollback- und Migrationsreferenz, sollen aber nach erfolgreichem Live-Deploy nicht parallel betrieben werden. @@ -55,6 +56,12 @@ INFLUXDB_BIND_IP=192.168.178.58 - `https://monitoring.kaleschke.info` leitet zu Authelia. - Grafana-Datasources `Prometheus`, `Loki` und `InfluxDB 3 Core` testen erfolgreich. -- Prometheus Targets: `prometheus`, `node-exporter`, `cadvisor`, `traefik`. +- Prometheus Targets: `prometheus`, `node-exporter`, `cadvisor`, `traefik`, `blackbox-http`. - Loki zeigt Container-Logs mit Labels `container`, `compose_project`, `compose_service`. - InfluxDB 3 Core enthaelt die Datenbank `homelab`. + +## Abloesepfad + +- Dozzle bleibt abgeloest: `Homelab / Containers + Logs` ersetzt Live-Logs und Error-Rate. 
+- Glances erst stoppen, wenn `Homelab / Host Overview` und `Homelab / Containers + Logs` fuer CPU, RAM, Disk, Network, Container-CPU und Container-RAM passen. +- Uptime Kuma erst stoppen, wenn `Homelab / Availability` und Grafana-Alerting mindestens sieben Tage parallel sauber laufen. diff --git a/monitoring/blackbox/blackbox.yml b/monitoring/blackbox/blackbox.yml new file mode 100644 index 0000000..ac561ac --- /dev/null +++ b/monitoring/blackbox/blackbox.yml @@ -0,0 +1,14 @@ +modules: + http_2xx: + prober: http + timeout: 10s + http: + valid_http_versions: + - HTTP/1.1 + - HTTP/2.0 + follow_redirects: true + preferred_ip_protocol: ip4 + + tcp_connect: + prober: tcp + timeout: 5s diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml index 7b08824..ca64085 100644 --- a/monitoring/docker-compose.yml +++ b/monitoring/docker-compose.yml @@ -10,6 +10,7 @@ services: - --web.enable-lifecycle volumes: - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro + - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro - prometheus_data:/prometheus networks: - monitoring_net @@ -18,9 +19,25 @@ services: security_opt: - no-new-privileges:true depends_on: + - blackbox-exporter - node-exporter - cadvisor + blackbox-exporter: + image: prom/blackbox-exporter:v0.27.0 + container_name: monitoring-blackbox-exporter + restart: unless-stopped + command: + - --config.file=/etc/blackbox_exporter/blackbox.yml + volumes: + - ./blackbox/blackbox.yml:/etc/blackbox_exporter/blackbox.yml:ro + networks: + - monitoring_net + expose: + - "9115" + security_opt: + - no-new-privileges:true + loki: image: grafana/loki:3.7.2 container_name: monitoring-loki @@ -74,6 +91,7 @@ services: volumes: - grafana_data:/var/lib/grafana - ./grafana/provisioning:/etc/grafana/provisioning:ro + - ./grafana/dashboards:/var/lib/grafana/dashboards:ro networks: - monitoring_net - frontend_net diff --git a/monitoring/grafana/dashboards/availability.json 
b/monitoring/grafana/dashboards/availability.json new file mode 100644 index 0000000..040928b --- /dev/null +++ b/monitoring/grafana/dashboards/availability.json @@ -0,0 +1,145 @@ +{ + "uid": "homelab-availability", + "title": "Homelab / Availability", + "tags": ["homelab", "blackbox", "uptime"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "30s", + "time": { + "from": "now-6h", + "to": "now" + }, + "templating": { + "list": [ + { + "name": "target", + "label": "Target", + "type": "query", + "datasource": "Prometheus", + "query": "label_values(probe_success{job=\"blackbox-http\"}, instance)", + "refresh": 1, + "includeAll": true, + "multi": true, + "allValue": ".+", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + } + } + ] + }, + "panels": [ + { + "id": 1, + "type": "stat", + "title": "Endpoints up", + "datasource": "Prometheus", + "gridPos": {"h": 5, "w": 6, "x": 0, "y": 0}, + "targets": [ + { + "refId": "A", + "expr": "sum(probe_success{job=\"blackbox-http\", instance=~\"${target:regex}\"})" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "green", "value": 1} + ] + } + }, + "overrides": [] + }, + "options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}} + }, + { + "id": 2, + "type": "stat", + "title": "Endpoints down", + "datasource": "Prometheus", + "gridPos": {"h": 5, "w": 6, "x": 6, "y": 0}, + "targets": [ + { + "refId": "A", + "expr": "count(probe_success{job=\"blackbox-http\", instance=~\"${target:regex}\"}) - sum(probe_success{job=\"blackbox-http\", instance=~\"${target:regex}\"})" + } + ], + "fieldConfig": { + "defaults": { + "unit": "short", + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "red", "value": 1} + ] + } + }, + "overrides": [] + }, + "options": {"reduceOptions": {"calcs": ["lastNotNull"], 
"fields": "", "values": false}} + }, + { + "id": 3, + "type": "timeseries", + "title": "Probe duration", + "datasource": "Prometheus", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}, + "targets": [ + { + "refId": "A", + "expr": "probe_duration_seconds{job=\"blackbox-http\", instance=~\"${target:regex}\"}", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": {"defaults": {"unit": "s"}, "overrides": []}, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}} + }, + { + "id": 4, + "type": "timeseries", + "title": "Availability", + "datasource": "Prometheus", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 5}, + "targets": [ + { + "refId": "A", + "expr": "probe_success{job=\"blackbox-http\", instance=~\"${target:regex}\"}", + "legendFormat": "{{instance}}" + } + ], + "fieldConfig": {"defaults": {"unit": "bool"}, "overrides": []}, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}} + }, + { + "id": 5, + "type": "table", + "title": "HTTP status", + "datasource": "Prometheus", + "gridPos": {"h": 9, "w": 24, "x": 0, "y": 13}, + "targets": [ + { + "refId": "A", + "expr": "probe_http_status_code{job=\"blackbox-http\", instance=~\"${target:regex}\"}", + "format": "table", + "instant": true + } + ], + "transformations": [ + { + "id": "organize", + "options": { + "excludeByName": {"Time": true, "__name__": true, "job": true}, + "renameByName": {"Value": "status_code", "instance": "target"} + } + } + ] + } + ] +} diff --git a/monitoring/grafana/dashboards/containers-logs.json b/monitoring/grafana/dashboards/containers-logs.json new file mode 100644 index 0000000..96ba5df --- /dev/null +++ b/monitoring/grafana/dashboards/containers-logs.json @@ -0,0 +1,109 @@ +{ + "uid": "homelab-containers-logs", + "title": "Homelab / Containers + Logs", + "tags": ["homelab", "cadvisor", "loki", "dozzle-replacement"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "30s", + "time": { + "from": "now-1h", + "to": "now" + }, + 
"templating": { + "list": [ + { + "name": "service", + "label": "Service", + "type": "query", + "datasource": "Loki", + "query": "label_values(compose_service)", + "refresh": 1, + "includeAll": true, + "multi": true, + "allValue": ".+", + "current": { + "selected": true, + "text": "All", + "value": "$__all" + } + }, + { + "name": "search", + "label": "Regex Match", + "type": "textbox", + "query": ".+", + "current": { + "selected": true, + "text": ".+", + "value": ".+" + } + } + ] + }, + "panels": [ + { + "id": 1, + "type": "timeseries", + "title": "Container CPU", + "datasource": "Prometheus", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}, + "targets": [ + { + "refId": "A", + "expr": "sum by (name) (rate(container_cpu_usage_seconds_total{name!=\"\"}[5m]))", + "legendFormat": "{{name}}" + } + ], + "fieldConfig": {"defaults": {"unit": "cores"}, "overrides": []} + }, + { + "id": 2, + "type": "timeseries", + "title": "Container memory", + "datasource": "Prometheus", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}, + "targets": [ + { + "refId": "A", + "expr": "sum by (name) (container_memory_working_set_bytes{name!=\"\"})", + "legendFormat": "{{name}}" + } + ], + "fieldConfig": {"defaults": {"unit": "bytes"}, "overrides": []} + }, + { + "id": 3, + "type": "timeseries", + "title": "Log error rate", + "datasource": "Loki", + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 8}, + "targets": [ + { + "refId": "A", + "expr": "sum by (compose_service) (count_over_time({compose_service=~\"${service:regex}\", container=~\".+\"} |~ \"(?i)error|exception|panic|fatal|traceback|oom|killed\" [$__interval]))", + "legendFormat": "{{compose_service}}" + } + ] + }, + { + "id": 4, + "type": "logs", + "title": "Live logs", + "datasource": "Loki", + "gridPos": {"h": 14, "w": 24, "x": 0, "y": 16}, + "targets": [ + { + "refId": "A", + "expr": "{compose_service=~\"${service:regex}\", container=~\".+\"} |~ \"$search\"" + } + ], + "options": { + "showLabels": false, + "showTime": true, + "sortOrder": 
"Descending", + "wrapLogMessage": false + } + } + ] +} diff --git a/monitoring/grafana/dashboards/host-overview.json b/monitoring/grafana/dashboards/host-overview.json new file mode 100644 index 0000000..4441f70 --- /dev/null +++ b/monitoring/grafana/dashboards/host-overview.json @@ -0,0 +1,102 @@ +{ + "uid": "homelab-host-overview", + "title": "Homelab / Host Overview", + "tags": ["homelab", "node-exporter", "glances-replacement"], + "timezone": "browser", + "schemaVersion": 39, + "version": 1, + "refresh": "30s", + "time": { + "from": "now-6h", + "to": "now" + }, + "panels": [ + { + "id": 1, + "type": "timeseries", + "title": "CPU usage", + "datasource": "Prometheus", + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}, + "targets": [ + { + "refId": "A", + "expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)", + "legendFormat": "CPU" + } + ], + "fieldConfig": {"defaults": {"unit": "percent", "min": 0, "max": 100}, "overrides": []} + }, + { + "id": 2, + "type": "timeseries", + "title": "Memory usage", + "datasource": "Prometheus", + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 0}, + "targets": [ + { + "refId": "A", + "expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))", + "legendFormat": "Memory" + } + ], + "fieldConfig": {"defaults": {"unit": "percent", "min": 0, "max": 100}, "overrides": []} + }, + { + "id": 3, + "type": "timeseries", + "title": "Filesystem usage", + "datasource": "Prometheus", + "gridPos": {"h": 9, "w": 12, "x": 0, "y": 8}, + "targets": [ + { + "refId": "A", + "expr": "100 * (1 - node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay\"})", + "legendFormat": "{{mountpoint}}" + } + ], + "fieldConfig": {"defaults": {"unit": "percent", "min": 0, "max": 100}, "overrides": []}, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}} + }, + { + "id": 4, + "type": "timeseries", + "title": "Network throughput", + "datasource": 
"Prometheus", + "gridPos": {"h": 9, "w": 12, "x": 12, "y": 8}, + "targets": [ + { + "refId": "A", + "expr": "rate(node_network_receive_bytes_total{device!~\"lo|veth.*|br-.*|docker.*\"}[5m])", + "legendFormat": "{{device}} RX" + }, + { + "refId": "B", + "expr": "rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|br-.*|docker.*\"}[5m])", + "legendFormat": "{{device}} TX" + } + ], + "fieldConfig": {"defaults": {"unit": "Bps"}, "overrides": []}, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}} + }, + { + "id": 5, + "type": "timeseries", + "title": "Disk IO", + "datasource": "Prometheus", + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 17}, + "targets": [ + { + "refId": "A", + "expr": "rate(node_disk_read_bytes_total[5m])", + "legendFormat": "{{device}} read" + }, + { + "refId": "B", + "expr": "rate(node_disk_written_bytes_total[5m])", + "legendFormat": "{{device}} write" + } + ], + "fieldConfig": {"defaults": {"unit": "Bps"}, "overrides": []} + } + ] +} diff --git a/monitoring/prometheus/alerts.yml b/monitoring/prometheus/alerts.yml new file mode 100644 index 0000000..8240c78 --- /dev/null +++ b/monitoring/prometheus/alerts.yml @@ -0,0 +1,49 @@ +groups: + - name: homelab-availability + rules: + - alert: HomelabEndpointDown + expr: probe_success{job="blackbox-http"} == 0 + for: 2m + labels: + severity: critical + annotations: + summary: "{{ $labels.instance }} is not reachable" + description: "Blackbox probe failed for {{ $labels.instance }}." + + - alert: HomelabEndpointSlow + expr: probe_duration_seconds{job="blackbox-http"} > 5 + for: 5m + labels: + severity: warning + annotations: + summary: "{{ $labels.instance }} is slow" + description: "Blackbox probe duration is above 5 seconds for {{ $labels.instance }}." 
+ + - name: homelab-host + rules: + - alert: HomelabDiskAlmostFull + expr: 100 * (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 85 + for: 10m + labels: + severity: warning + annotations: + summary: "Disk usage high on {{ $labels.mountpoint }}" + description: "{{ $labels.mountpoint }} is above 85% used." + + - alert: HomelabHighMemoryUsage + expr: 100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 90 + for: 10m + labels: + severity: warning + annotations: + summary: "Memory usage high" + description: "Host memory usage is above 90%." + + - alert: HomelabTraefik5xx + expr: sum(rate(traefik_service_requests_total{code=~"5.."}[5m])) by (service) > 0 + for: 2m + labels: + severity: warning + annotations: + summary: "Traefik 5xx responses for {{ $labels.service }}" + description: "Traefik reports 5xx responses for {{ $labels.service }}." diff --git a/monitoring/prometheus/prometheus.yml b/monitoring/prometheus/prometheus.yml index 5a533e9..7249485 100644 --- a/monitoring/prometheus/prometheus.yml +++ b/monitoring/prometheus/prometheus.yml @@ -4,6 +4,9 @@ global: external_labels: site: kallilabcore +rule_files: + - /etc/prometheus/alerts.yml + scrape_configs: - job_name: prometheus static_configs: @@ -26,3 +29,40 @@ scrape_configs: # Traefik exposes Prometheus metrics internally on its metrics entrypoint. 
- targets: - traefik:8082 + + - job_name: blackbox-http + metrics_path: /probe + params: + module: + - http_2xx + static_configs: + - targets: + - https://monitoring.kaleschke.info + - https://auth.kaleschke.info + - https://git.kaleschke.info + - https://komodo.kaleschke.info + - https://uptime.kaleschke.info + - https://home.kaleschke.info + - https://paperless.kaleschke.info + - https://paperless-gpt.kaleschke.info + - https://immich.kaleschke.info + - https://mealie.kaleschke.info + - https://vault.kaleschke.info + - https://cloud.kaleschke.info + - https://ntfy.kaleschke.info + - https://borg.kaleschke.info + - https://files.kaleschke.info + - https://code.kaleschke.info + - https://glances.kaleschke.info + - https://scrutiny.kaleschke.info + - https://speedtest.kaleschke.info + - https://pdf.kaleschke.info + relabel_configs: + - source_labels: + - __address__ + target_label: __param_target + - source_labels: + - __param_target + target_label: instance + - target_label: __address__ + replacement: blackbox-exporter:9115