Prepare monitoring alert rules
This commit is contained in:
+110
@@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env bash
#
# Writes homelab metrics (exporter heartbeat, critical container state, Borg
# backup status) in Prometheus text format to OUTPUT_FILE.
#
# Every setting below can be overridden from the environment; an unset or
# empty variable falls back to the default shown here.
set -euo pipefail

# Directory that receives the metrics file, and the metrics file itself.
: "${TEXTFILE_DIR:=/mnt/user/services/posture-check/textfile}"
: "${OUTPUT_FILE:=${TEXTFILE_DIR}/homelab.prom}"

# Name of the container that holds the Borg UI job database.
: "${BORG_CONTAINER:=borg-ui}"

# Space-separated names of the containers reported as "critical".
# The default is assembled in chunks purely for readability; the resulting
# value is a single space-separated line.
if [[ -z "${CRITICAL_CONTAINERS:-}" ]]; then
  CRITICAL_CONTAINERS="traefik authelia postgresql17 gitea"
  CRITICAL_CONTAINERS+=" komodo-core komodo-mongo komodo-periphery"
  CRITICAL_CONTAINERS+=" vaultwarden borg-ui ntfy adguard unbound Tailscale-Docker"
  CRITICAL_CONTAINERS+=" monitoring-alertmanager monitoring-alertmanager-ntfy-bridge"
  CRITICAL_CONTAINERS+=" monitoring-blackbox-exporter monitoring-cadvisor monitoring-grafana"
  CRITICAL_CONTAINERS+=" monitoring-loki monitoring-node-exporter monitoring-promtail"
  CRITICAL_CONTAINERS+=" immich_server immich_postgres immich_redis"
  CRITICAL_CONTAINERS+=" paperless-ngx nextcloud nextcloud-postgres nextcloud-redis"
  CRITICAL_CONTAINERS+=" mealie mealie-postgres"
fi
|
||||
|
||||
# Stage the metrics in a uniquely named temp file inside the target directory;
# the finished file is moved into place later in the script.
mkdir -p "$TEXTFILE_DIR"
tmp="$(mktemp "${TEXTFILE_DIR}/homelab.prom.XXXXXX")"

# Delete the staging file whenever the script exits while this trap is
# installed, so an aborted run leaves nothing behind.
cleanup() { rm -f "$tmp"; }
trap cleanup EXIT

# One timestamp for the whole run, in Unix epoch seconds.
now="$(date '+%s')"
|
||||
|
||||
{
|
||||
# Heartbeat gauge: metadata lines followed by the run timestamp held in $now.
printf '%s\n' \
  '# HELP homelab_textfile_exporter_last_run_timestamp_seconds Unix timestamp of the last successful homelab textfile exporter run.' \
  '# TYPE homelab_textfile_exporter_last_run_timestamp_seconds gauge' \
  "homelab_textfile_exporter_last_run_timestamp_seconds ${now}"
|
||||
|
||||
#######################################
# Print the homelab_critical_container_running gauge for a list of containers.
# Arguments: $1 - whitespace-separated container names
# Outputs:   HELP/TYPE lines plus one sample per name on stdout. The value is
#            1 when `docker inspect` prints "true" for .State.Running and 0 in
#            every other case (stopped, unknown name, docker error).
# Returns:   0
#######################################
emit_critical_container_metrics() {
  local -a names=()
  local name state running

  # Split on whitespace WITHOUT pathname expansion, so an entry such as "*"
  # is reported literally instead of being replaced by file names from the
  # current directory. read returns non-zero at end of input; that is expected.
  read -r -d '' -a names <<<"${1:-}" || true

  printf '%s\n' \
    '# HELP homelab_critical_container_running Whether a critical container is currently running according to docker inspect.' \
    '# TYPE homelab_critical_container_running gauge'

  # The ${names[@]+...} form keeps an empty list safe under `set -u` on
  # bash releases older than 4.4.
  for name in ${names[@]+"${names[@]}"}; do
    # Capture the state and compare it directly; this avoids a
    # `docker | grep -q` pipeline whose status depends on pipefail.
    state="$(docker inspect -f '{{.State.Running}}' "$name" 2>/dev/null)" || state=""
    running=0
    if [[ "$state" == "true" ]]; then
      running=1
    fi
    printf 'homelab_critical_container_running{name="%s"} %s\n' "$name" "$running"
  done
}

emit_critical_container_metrics "$CRITICAL_CONTAINERS"
|
||||
|
||||
#######################################
# Query the Borg UI job database inside a running container.
# Arguments: $1 - container name
# Outputs:   exactly three sample lines on stdout, in this order:
#            homelab_borg_last_success, homelab_borg_last_job_warning,
#            homelab_borg_last_completed_timestamp_seconds
# Returns:   exit status of `docker exec` (non-zero if the Python program or
#            the database query fails)
#######################################
query_borg_samples() {
  docker exec -i "$1" python3 - <<'PY'
import datetime as dt
import sqlite3

# mode=ro: this probe must never create or modify the Borg UI database.
# (A plain path would silently create an empty database if the file is missing.)
conn = sqlite3.connect("file:/data/borg.db?mode=ro", uri=True)
conn.row_factory = sqlite3.Row
cur = conn.cursor()

# Most recently started job, whatever its outcome.
latest = cur.execute("""
    select status, completed_at, archive_name
    from backup_jobs
    order by coalesce(started_at, created_at) desc
    limit 1
""").fetchone()

# Most recent job that finished, with or without warnings.
completed = cur.execute("""
    select completed_at, archive_name
    from backup_jobs
    where status in ('completed', 'completed_with_warnings')
      and completed_at is not null
    order by completed_at desc
    limit 1
""").fetchone()


def parse_ts(value):
    """Convert a stored timestamp to Unix seconds; return 0 if unparseable.

    Values without an explicit offset are interpreted as UTC.
    NOTE(review): assumes Borg UI stores naive timestamps in UTC - confirm.
    """
    if not value:
        return 0
    # fromisoformat() on older Python versions rejects a trailing "Z".
    value = str(value).replace("Z", "+00:00")
    try:
        parsed = dt.datetime.fromisoformat(value)
    except ValueError:
        try:
            parsed = dt.datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
        except ValueError:
            return 0
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=dt.timezone.utc)
    return int(parsed.timestamp())


def escape_label(value):
    """Escape backslash, double quote and line feed for a label value."""
    return (
        str(value or "")
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
    )


latest_status = latest["status"] if latest else "missing"
latest_success = 1 if latest_status in ("completed", "completed_with_warnings") else 0
latest_warning = 1 if latest_status == "completed_with_warnings" else 0
completed_ts = parse_ts(completed["completed_at"]) if completed else 0
status_label = escape_label(latest_status)
latest_archive = escape_label(latest["archive_name"] if latest else "")
completed_archive = escape_label(completed["archive_name"] if completed else "")

print(f'homelab_borg_last_success{{status="{status_label}",archive="{latest_archive}"}} {latest_success}')
print(f'homelab_borg_last_job_warning{{status="{status_label}",archive="{latest_archive}"}} {latest_warning}')
print(f'homelab_borg_last_completed_timestamp_seconds{{archive="{completed_archive}"}} {completed_ts}')
PY
}

#######################################
# Print zero-valued Borg samples for the case where no job data is available.
# Arguments: $1 - value for the status label
# Outputs:   three sample lines, same order as query_borg_samples
#######################################
borg_placeholder_samples() {
  printf 'homelab_borg_last_success{status="%s",archive=""} 0\n' "$1"
  printf 'homelab_borg_last_job_warning{status="%s",archive=""} 0\n' "$1"
  printf 'homelab_borg_last_completed_timestamp_seconds{archive=""} 0\n'
}

#######################################
# Print the Borg backup gauges, each preceded by its own HELP/TYPE lines.
#
# A missing or stopped container, or a failing query, yields zero-valued
# samples with status "container_missing", "container_not_running" or
# "query_failed" instead of aborting the whole export under `set -e`; the
# other metrics in the file therefore keep being refreshed.
#
# Arguments: $1 - name of the Borg UI container
# Outputs:   nine lines of Prometheus text format on stdout; a warning on
#            stderr when the query fails or returns too few lines
# Returns:   0
#######################################
emit_borg_metrics() {
  local container=$1
  local state samples success_line warning_line completed_line

  if ! state="$(docker inspect -f '{{.State.Running}}' "$container" 2>/dev/null)"; then
    samples="$(borg_placeholder_samples container_missing)"
  elif [[ "$state" != "true" ]]; then
    # `docker exec` only works on running containers.
    samples="$(borg_placeholder_samples container_not_running)"
  elif ! samples="$(query_borg_samples "$container")"; then
    printf 'warning: Borg query in container %s failed\n' "$container" >&2
    samples="$(borg_placeholder_samples query_failed)"
  fi

  # Split into the three expected lines; anything shorter is treated as a
  # failed query so that no incomplete sample set is ever published.
  if ! {
    IFS= read -r success_line \
      && IFS= read -r warning_line \
      && IFS= read -r completed_line
  } <<<"$samples"; then
    printf 'warning: Borg query in container %s returned incomplete output\n' "$container" >&2
    {
      IFS= read -r success_line
      IFS= read -r warning_line
      IFS= read -r completed_line
    } <<<"$(borg_placeholder_samples query_failed)"
  fi

  printf '%s\n' \
    '# HELP homelab_borg_last_completed_timestamp_seconds Unix timestamp of the most recent completed Borg backup job known to Borg UI.' \
    '# TYPE homelab_borg_last_completed_timestamp_seconds gauge' \
    "$completed_line" \
    '# HELP homelab_borg_last_success Whether the most recent Borg backup job completed successfully.' \
    '# TYPE homelab_borg_last_success gauge' \
    "$success_line" \
    '# HELP homelab_borg_last_job_warning Whether the most recent Borg backup job completed with warnings.' \
    '# TYPE homelab_borg_last_job_warning gauge' \
    "$warning_line"
}

emit_borg_metrics "$BORG_CONTAINER"
|
||||
} > "$tmp"
|
||||
|
||||
#######################################
# Move a finished staging file to its final name and make it world-readable.
#
# mktemp creates files with mode 0600 and mv keeps that mode, so without the
# chmod only the user running this script could read the result.
# NOTE(review): presumably the metrics collector runs as a different user
# (e.g. inside the node-exporter container) - confirm.
#
# Arguments: $1 - staging file, $2 - destination path
# Returns:   non-zero if chmod or mv fails
#######################################
publish_textfile() {
  local staged=$1 target=$2
  chmod 0644 -- "$staged" || return
  mv -- "$staged" "$target"
}

publish_textfile "$tmp" "$OUTPUT_FILE"

# The staging file no longer exists under its temporary name; drop the EXIT trap.
trap - EXIT

# Report the path that was written.
printf '%s\n' "$OUTPUT_FILE"
|
||||
@@ -42,6 +42,27 @@ Zeit: taeglich 06:20, Cron `20 6 * * *`.
|
||||
bash /mnt/user/services/homelab-infra/services/posture-check/compose-runtime-drift.sh
|
||||
```
|
||||
|
||||
## `prometheus-textfile-export-15min`
|
||||
|
||||
Zeit: alle 15 Minuten, Cron `*/15 * * * *`.
|
||||
|
||||
Zweck:
|
||||
|
||||
- Borg-Backup-Frische fuer Prometheus sichtbar machen
|
||||
- kritische Container als explizite 0/1-Metrik exportieren
|
||||
- Grundlage fuer `HomelabBorgBackupStale`, `HomelabBorgLastJobFailed` und `HomelabCriticalContainerDown`
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
bash /mnt/user/services/homelab-infra/services/posture-check/export-prometheus-textfile.sh
|
||||
```
|
||||
|
||||
Ziel-Datei:
|
||||
|
||||
```text
|
||||
/mnt/user/services/posture-check/textfile/homelab.prom
|
||||
```
|
||||
|
||||
## `homelab-operations-report-daily`
|
||||
|
||||
Zeit: taeglich nach Borg und den Morgenchecks, z. B. 07:30, Cron `30 7 * * *`.
|
||||
|
||||
Reference in New Issue
Block a user