Add monitoring replacement baseline
This commit is contained in:
@@ -10,6 +10,7 @@ Zielzustand: ein zentraler Observability-Stack fuer KalliLab CORE.
|
||||
- `monitoring-promtail`: Docker-Log-Discovery ueber read-only Docker-Socket
|
||||
- `monitoring-node-exporter`: Host-Metriken
|
||||
- `monitoring-cadvisor`: Container-Metriken
|
||||
- `monitoring-blackbox-exporter`: externe HTTP-Erreichbarkeit als Uptime-Kuma-Abloesepfad
|
||||
- `monitoring-influxdb3-core`: InfluxDB 3 Core fuer Home-Assistant-/Ecowitt-Langzeitdaten
|
||||
|
||||
Die alten Pfade `ops/loki` und `ops/grafana-influxdb` sind damit abgeloeste Altstaende. Sie bleiben vorerst im Repo als Rollback- und Migrationsreferenz, sollen aber nach erfolgreichem Live-Deploy nicht parallel betrieben werden.
|
||||
@@ -55,6 +56,12 @@ INFLUXDB_BIND_IP=192.168.178.58
|
||||
|
||||
- `https://monitoring.kaleschke.info` leitet zu Authelia.
|
||||
- Grafana-Datasources `Prometheus`, `Loki` und `InfluxDB 3 Core` testen erfolgreich.
|
||||
- Prometheus Targets: `prometheus`, `node-exporter`, `cadvisor`, `traefik`.
|
||||
- Prometheus Targets: `prometheus`, `node-exporter`, `cadvisor`, `traefik`, `blackbox-http`.
|
||||
- Loki zeigt Container-Logs mit Labels `container`, `compose_project`, `compose_service`.
|
||||
- InfluxDB 3 Core enthaelt die Datenbank `homelab`.
|
||||
|
||||
## Abloesepfad
|
||||
|
||||
- Dozzle bleibt abgeloest: `Homelab / Containers + Logs` ersetzt Live-Logs und Error-Rate.
|
||||
- Glances erst stoppen, wenn `Homelab / Host Overview` und `Homelab / Containers + Logs` fuer CPU, RAM, Disk, Network, Container-CPU und Container-RAM passen.
|
||||
- Uptime Kuma erst stoppen, wenn `Homelab / Availability` und Grafana-Alerting mindestens sieben Tage parallel sauber laufen.
|
||||
|
||||
@@ -0,0 +1,14 @@
|
||||
# Probe modules for the Prometheus blackbox exporter.
# A module is selected per scrape through the `module` URL parameter.
modules:
  # HTTP(S) probe. No `valid_status_codes` is set here, so the exporter's
  # default acceptance rule applies (2xx according to the upstream docs).
  http_2xx:
    prober: http
    timeout: 10s
    http:
      # Resolve and connect over IPv4 by preference.
      preferred_ip_protocol: ip4
      # Judge the final response after redirects, not the redirect itself.
      follow_redirects: true
      valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]

  # Plain TCP connect probe.
  # NOTE(review): no visible scrape job selects this module - confirm it is
  # intended for later use.
  tcp_connect:
    prober: tcp
    timeout: 5s
|
||||
@@ -10,6 +10,7 @@ services:
|
||||
- --web.enable-lifecycle
|
||||
volumes:
|
||||
- ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
||||
- ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
|
||||
- prometheus_data:/prometheus
|
||||
networks:
|
||||
- monitoring_net
|
||||
@@ -18,9 +19,25 @@ services:
|
||||
security_opt:
|
||||
- no-new-privileges:true
|
||||
depends_on:
|
||||
- blackbox-exporter
|
||||
- node-exporter
|
||||
- cadvisor
|
||||
|
||||
# Blackbox exporter: performs the probes that Prometheus requests through
# its /probe endpoint, using the module file mounted below.
blackbox-exporter:
  container_name: monitoring-blackbox-exporter
  image: prom/blackbox-exporter:v0.27.0
  restart: unless-stopped
  security_opt: ["no-new-privileges:true"]
  networks: [monitoring_net]
  # `expose` only documents the port for other containers on the shared
  # network; nothing is published on the host.
  expose: ["9115"]
  volumes:
    # Module definitions, mounted read-only.
    - ./blackbox/blackbox.yml:/etc/blackbox_exporter/blackbox.yml:ro
  command:
    - --config.file=/etc/blackbox_exporter/blackbox.yml
|
||||
|
||||
loki:
|
||||
image: grafana/loki:3.7.2
|
||||
container_name: monitoring-loki
|
||||
@@ -74,6 +91,7 @@ services:
|
||||
volumes:
|
||||
- grafana_data:/var/lib/grafana
|
||||
- ./grafana/provisioning:/etc/grafana/provisioning:ro
|
||||
- ./grafana/dashboards:/var/lib/grafana/dashboards:ro
|
||||
networks:
|
||||
- monitoring_net
|
||||
- frontend_net
|
||||
|
||||
@@ -0,0 +1,145 @@
|
||||
{
|
||||
"uid": "homelab-availability",
|
||||
"title": "Homelab / Availability",
|
||||
"tags": ["homelab", "blackbox", "uptime"],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 39,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "target",
|
||||
"label": "Target",
|
||||
"type": "query",
|
||||
"datasource": "Prometheus",
|
||||
"query": "label_values(probe_success{job=\"blackbox-http\"}, instance)",
|
||||
"refresh": 1,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".+",
|
||||
"current": {
|
||||
"selected": true,
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "stat",
|
||||
"title": "Endpoints up",
|
||||
"datasource": "Prometheus",
|
||||
"gridPos": {"h": 5, "w": 6, "x": 0, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum(probe_success{job=\"blackbox-http\", instance=~\"${target:regex}\"})"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "red", "value": null},
|
||||
{"color": "green", "value": 1}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "stat",
|
||||
"title": "Endpoints down",
|
||||
"datasource": "Prometheus",
|
||||
"gridPos": {"h": 5, "w": 6, "x": 6, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "count(probe_success{job=\"blackbox-http\", instance=~\"${target:regex}\"}) - sum(probe_success{job=\"blackbox-http\", instance=~\"${target:regex}\"})"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {
|
||||
"defaults": {
|
||||
"unit": "short",
|
||||
"thresholds": {
|
||||
"mode": "absolute",
|
||||
"steps": [
|
||||
{"color": "green", "value": null},
|
||||
{"color": "red", "value": 1}
|
||||
]
|
||||
}
|
||||
},
|
||||
"overrides": []
|
||||
},
|
||||
"options": {"reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "timeseries",
|
||||
"title": "Probe duration",
|
||||
"datasource": "Prometheus",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "probe_duration_seconds{job=\"blackbox-http\", instance=~\"${target:regex}\"}",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "s"}, "overrides": []},
|
||||
"options": {"legend": {"displayMode": "list", "placement": "bottom"}}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "timeseries",
|
||||
"title": "Availability",
|
||||
"datasource": "Prometheus",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 5},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "probe_success{job=\"blackbox-http\", instance=~\"${target:regex}\"}",
|
||||
"legendFormat": "{{instance}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "bool"}, "overrides": []},
|
||||
"options": {"legend": {"displayMode": "list", "placement": "bottom"}}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "table",
|
||||
"title": "HTTP status",
|
||||
"datasource": "Prometheus",
|
||||
"gridPos": {"h": 9, "w": 24, "x": 0, "y": 13},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "probe_http_status_code{job=\"blackbox-http\", instance=~\"${target:regex}\"}",
|
||||
"format": "table",
|
||||
"instant": true
|
||||
}
|
||||
],
|
||||
"transformations": [
|
||||
{
|
||||
"id": "organize",
|
||||
"options": {
|
||||
"excludeByName": {"Time": true, "__name__": true, "job": true},
|
||||
"renameByName": {"Value": "status_code", "instance": "target"}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,109 @@
|
||||
{
|
||||
"uid": "homelab-containers-logs",
|
||||
"title": "Homelab / Containers + Logs",
|
||||
"tags": ["homelab", "cadvisor", "loki", "dozzle-replacement"],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 39,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": {
|
||||
"from": "now-1h",
|
||||
"to": "now"
|
||||
},
|
||||
"templating": {
|
||||
"list": [
|
||||
{
|
||||
"name": "service",
|
||||
"label": "Service",
|
||||
"type": "query",
|
||||
"datasource": "Loki",
|
||||
"query": "label_values(compose_service)",
|
||||
"refresh": 1,
|
||||
"includeAll": true,
|
||||
"multi": true,
|
||||
"allValue": ".+",
|
||||
"current": {
|
||||
"selected": true,
|
||||
"text": "All",
|
||||
"value": "$__all"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "search",
|
||||
"label": "Regex Match",
|
||||
"type": "textbox",
|
||||
"query": ".+",
|
||||
"current": {
|
||||
"selected": true,
|
||||
"text": ".+",
|
||||
"value": ".+"
|
||||
}
|
||||
}
|
||||
]
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "timeseries",
|
||||
"title": "Container CPU",
|
||||
"datasource": "Prometheus",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum by (name) (rate(container_cpu_usage_seconds_total{name!=\"\"}[5m]))",
|
||||
"legendFormat": "{{name}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "cores"}, "overrides": []}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "timeseries",
|
||||
"title": "Container memory",
|
||||
"datasource": "Prometheus",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum by (name) (container_memory_working_set_bytes{name!=\"\"})",
|
||||
"legendFormat": "{{name}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "bytes"}, "overrides": []}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "timeseries",
|
||||
"title": "Log error rate",
|
||||
"datasource": "Loki",
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 8},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "sum by (compose_service) (count_over_time({compose_service=~\"${service:regex}\", container=~\".+\"} |~ \"(?i)error|exception|panic|fatal|traceback|oom|killed\" [$__interval]))",
|
||||
"legendFormat": "{{compose_service}}"
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "logs",
|
||||
"title": "Live logs",
|
||||
"datasource": "Loki",
|
||||
"gridPos": {"h": 14, "w": 24, "x": 0, "y": 16},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "{compose_service=~\"${service:regex}\", container=~\".+\"} |~ \"$search\""
|
||||
}
|
||||
],
|
||||
"options": {
|
||||
"showLabels": false,
|
||||
"showTime": true,
|
||||
"sortOrder": "Descending",
|
||||
"wrapLogMessage": false
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,102 @@
|
||||
{
|
||||
"uid": "homelab-host-overview",
|
||||
"title": "Homelab / Host Overview",
|
||||
"tags": ["homelab", "node-exporter", "glances-replacement"],
|
||||
"timezone": "browser",
|
||||
"schemaVersion": 39,
|
||||
"version": 1,
|
||||
"refresh": "30s",
|
||||
"time": {
|
||||
"from": "now-6h",
|
||||
"to": "now"
|
||||
},
|
||||
"panels": [
|
||||
{
|
||||
"id": 1,
|
||||
"type": "timeseries",
|
||||
"title": "CPU usage",
|
||||
"datasource": "Prometheus",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "100 - (avg(rate(node_cpu_seconds_total{mode=\"idle\"}[5m])) * 100)",
|
||||
"legendFormat": "CPU"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "percent", "min": 0, "max": 100}, "overrides": []}
|
||||
},
|
||||
{
|
||||
"id": 2,
|
||||
"type": "timeseries",
|
||||
"title": "Memory usage",
|
||||
"datasource": "Prometheus",
|
||||
"gridPos": {"h": 8, "w": 12, "x": 12, "y": 0},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "100 * (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))",
|
||||
"legendFormat": "Memory"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "percent", "min": 0, "max": 100}, "overrides": []}
|
||||
},
|
||||
{
|
||||
"id": 3,
|
||||
"type": "timeseries",
|
||||
"title": "Filesystem usage",
|
||||
"datasource": "Prometheus",
|
||||
"gridPos": {"h": 9, "w": 12, "x": 0, "y": 8},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "100 * (1 - node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay\"} / node_filesystem_size_bytes{fstype!~\"tmpfs|overlay\"})",
|
||||
"legendFormat": "{{mountpoint}}"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "percent", "min": 0, "max": 100}, "overrides": []},
|
||||
"options": {"legend": {"displayMode": "list", "placement": "bottom"}}
|
||||
},
|
||||
{
|
||||
"id": 4,
|
||||
"type": "timeseries",
|
||||
"title": "Network throughput",
|
||||
"datasource": "Prometheus",
|
||||
"gridPos": {"h": 9, "w": 12, "x": 12, "y": 8},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "rate(node_network_receive_bytes_total{device!~\"lo|veth.*|br-.*|docker.*\"}[5m])",
|
||||
"legendFormat": "{{device}} RX"
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"expr": "rate(node_network_transmit_bytes_total{device!~\"lo|veth.*|br-.*|docker.*\"}[5m])",
|
||||
"legendFormat": "{{device}} TX"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "Bps"}, "overrides": []},
|
||||
"options": {"legend": {"displayMode": "list", "placement": "bottom"}}
|
||||
},
|
||||
{
|
||||
"id": 5,
|
||||
"type": "timeseries",
|
||||
"title": "Disk IO",
|
||||
"datasource": "Prometheus",
|
||||
"gridPos": {"h": 8, "w": 24, "x": 0, "y": 17},
|
||||
"targets": [
|
||||
{
|
||||
"refId": "A",
|
||||
"expr": "rate(node_disk_read_bytes_total[5m])",
|
||||
"legendFormat": "{{device}} read"
|
||||
},
|
||||
{
|
||||
"refId": "B",
|
||||
"expr": "rate(node_disk_written_bytes_total[5m])",
|
||||
"legendFormat": "{{device}} write"
|
||||
}
|
||||
],
|
||||
"fieldConfig": {"defaults": {"unit": "Bps"}, "overrides": []}
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,49 @@
|
||||
# Prometheus alerting rules for the homelab monitoring stack.
# Every rule carries a `severity` label (critical / warning).
groups:
  # Endpoint reachability, based on blackbox exporter probe results.
  - name: homelab-availability
    rules:
      # The probe has been failing continuously for two minutes.
      - alert: HomelabEndpointDown
        expr: probe_success{job="blackbox-http"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.instance }} is not reachable"
          description: "Blackbox probe failed for {{ $labels.instance }}."

      # The endpoint answers, but slowly. The `and probe_success == 1` clause
      # keeps this rule from firing for probes that fail by timing out: their
      # duration sits near the probe timeout (well above 5s), and that case is
      # already covered by HomelabEndpointDown.
      - alert: HomelabEndpointSlow
        expr: >-
          probe_duration_seconds{job="blackbox-http"} > 5
          and
          probe_success{job="blackbox-http"} == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.instance }} is slow"
          description: "Blackbox probe duration is above 5 seconds for {{ $labels.instance }}."

  # Host and reverse-proxy health.
  - name: homelab-host
    rules:
      # Used space in percent per filesystem, ignoring tmpfs and overlay mounts.
      - alert: HomelabDiskAlmostFull
        expr: 100 * (1 - node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) > 85
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Disk usage high on {{ $labels.mountpoint }}"
          description: "{{ $labels.mountpoint }} is above 85% used."

      # Memory in use, derived from MemAvailable relative to MemTotal.
      - alert: HomelabHighMemoryUsage
        expr: 100 * (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Memory usage high"
          description: "Host memory usage is above 90%."

      # Any non-zero 5xx request rate per Traefik service.
      - alert: HomelabTraefik5xx
        expr: sum(rate(traefik_service_requests_total{code=~"5.."}[5m])) by (service) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Traefik 5xx responses for {{ $labels.service }}"
          description: "Traefik reports 5xx responses for {{ $labels.service }}."
|
||||
@@ -4,6 +4,9 @@ global:
|
||||
external_labels:
|
||||
site: kallilabcore
|
||||
|
||||
rule_files:
|
||||
- /etc/prometheus/alerts.yml
|
||||
|
||||
scrape_configs:
|
||||
- job_name: prometheus
|
||||
static_configs:
|
||||
@@ -26,3 +29,40 @@ scrape_configs:
|
||||
# Traefik exposes Prometheus metrics internally on its metrics entrypoint.
|
||||
- targets:
|
||||
- traefik:8082
|
||||
|
||||
# HTTP availability probes, executed by the blackbox exporter on behalf of
# Prometheus. Each entry under `targets` is the URL to probe, not the address
# that Prometheus scrapes.
- job_name: blackbox-http
  metrics_path: /probe
  params:
    module: [http_2xx]
  static_configs:
    - targets:
        - https://monitoring.kaleschke.info
        - https://auth.kaleschke.info
        - https://git.kaleschke.info
        - https://komodo.kaleschke.info
        - https://uptime.kaleschke.info
        - https://home.kaleschke.info
        - https://paperless.kaleschke.info
        - https://paperless-gpt.kaleschke.info
        - https://immich.kaleschke.info
        - https://mealie.kaleschke.info
        - https://vault.kaleschke.info
        - https://cloud.kaleschke.info
        - https://ntfy.kaleschke.info
        - https://borg.kaleschke.info
        - https://files.kaleschke.info
        - https://code.kaleschke.info
        - https://glances.kaleschke.info
        - https://scrutiny.kaleschke.info
        - https://speedtest.kaleschke.info
        - https://pdf.kaleschke.info
  relabel_configs:
    # 1. Hand the listed URL to the exporter as the `target` URL parameter.
    - source_labels: [__address__]
      target_label: __param_target
    # 2. Keep that URL as the `instance` label on the resulting series.
    - source_labels: [__param_target]
      target_label: instance
    # 3. Point the actual scrape at the blackbox exporter itself.
    - target_label: __address__
      replacement: blackbox-exporter:9115
|
||||
|
||||
Reference in New Issue
Block a user