bc9ace315a
Findings aus dem Backup-/Restore-Audit 2026-06-18 umgesetzt: - Dump-Frische als Prometheus-Metrik (homelab_borg_dump_present / homelab_borg_dump_age_seconds) im Host-Exporter; schliesst den Blindfleck, dass Borg weiterlaeuft und stale Dumps archiviert, ohne Job-Fehler. - Neue Alerts HomelabBorgDumpMissing / HomelabBorgDumpStale (critical) plus ALERT_RULES.md. - Freshness-Gate (.sh + .ps1) und H:-Nearline-Pull um n8n.sqlite.dump und postgresql17-globals.sql ergaenzt. - Critical-Container-Watch um mail-archiver, n8n, homeassistant, smarthome-mosquitto erweitert. - BACKUP_SCOPE: /mnt/user/projekte und sonstige User-Shares ausserhalb App-Scope als bewusste offene Operator-Entscheidung dokumentiert; Hermes-data-Pfad als geparkt klargestellt. - MASTER_TODO: Nearline-Pull-Ueberwachung, Host-Pull-Nachzug und projekte-Scope-Entscheidung aufgenommen. Enthaelt ausserdem die zuvor vorbereiteten Scope-Erweiterungen (nextcloud html+data, n8n, filebrowser, influxdb3) und Scope-Drift-/ Retention-/Compact-/Check-Alerts. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
286 lines
12 KiB
Bash
Executable File
286 lines
12 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
set -euo pipefail
|
|
|
|
# Runtime configuration. Every setting keeps a non-empty value supplied through
# the environment and otherwise falls back to the default given here.

# Directory the metrics file is written to, and the metrics file itself.
: "${TEXTFILE_DIR:=/mnt/user/services/posture-check/textfile}"
: "${OUTPUT_FILE:=$TEXTFILE_DIR/homelab.prom}"

# Borg UI container to query, and the expected-sources list that is handed to
# the query running inside that container.
: "${BORG_CONTAINER:=borg-ui}"
: "${BORG_EXPECTED_SOURCES_FILE:=/local/services/homelab-infra/ops/borg-ui/all-important-sources.txt}"

# Host path of the current dump artifacts (pre-backup-dumps.sh writes here).
# The files are stat'ed on the host side; the exporter runs as an Unraid User
# Script.
: "${BORG_DUMP_DIR:=/mnt/user/backups/borg/dumps/latest}"

# Whitespace-separated names of the containers reported as "critical".
# Note: Tailscale runs as a native Unraid plugin (not a Docker container) and
# is therefore deliberately NOT listed as a critical container (as of
# 2026-06-06).
: "${CRITICAL_CONTAINERS:=traefik authelia postgresql17 gitea komodo-core komodo-mongo komodo-periphery vaultwarden borg-ui ntfy adguard unbound monitoring-alertmanager monitoring-alertmanager-ntfy-bridge monitoring-blackbox-exporter monitoring-cadvisor monitoring-grafana monitoring-loki monitoring-node-exporter monitoring-promtail immich_server immich_postgres immich_redis paperless-ngx nextcloud nextcloud-postgres nextcloud-redis mealie mealie-postgres mail-archiver n8n homeassistant smarthome-mosquitto}"
|
|
|
|
# Stage the output next to its default destination: the temporary file is
# created inside $TEXTFILE_DIR and only moved into place once it is complete.
mkdir -p "$TEXTFILE_DIR"
tmp="$(mktemp "$TEXTFILE_DIR/homelab.prom.XXXXXX")"

# Remove the staging file on every exit path that happens before the final
# move.
cleanup() { rm -f "$tmp"; }
trap cleanup EXIT

# Single reference time (Unix seconds) used for this whole run.
now="$(date +%s)"
|
|
|
|
#######################################
# Emit homelab_gitops_runtime_image_match for every running Compose container:
# 1 when the image the container runs equals the image its Compose config
# declares for that service, 0 otherwise.
#
# Best effort by design: a missing jq, a failing `docker ps`, an unreadable
# Compose file or an unresolvable image only skips series. The function never
# returns non-zero, so under `set -euo pipefail` it cannot abort the exporter
# run and leave the previous metrics file in place.
#
# Globals:   none
# Arguments: none
# Outputs:   HELP/TYPE header plus zero or more samples on stdout;
#            a one-line warning on stderr when `docker ps` fails
# Returns:   0 always
#######################################
emit_gitops_runtime_image_metrics() {
  local rows container project service config_files env_file
  local config_file expected running match
  local -a compose_args

  printf '%s\n' \
    '# HELP homelab_gitops_runtime_image_match Whether a running Compose container uses the image currently declared by its Compose config.' \
    '# TYPE homelab_gitops_runtime_image_match gauge'

  # Without jq the declared image cannot be extracted; emit the header only.
  if ! command -v jq >/dev/null 2>&1; then
    return 0
  fi

  # Capture the listing first so a failing `docker ps` is handled here instead
  # of failing a pipeline (and, through errexit, the whole script).
  if ! rows="$(
    docker ps \
      --filter label=com.docker.compose.project \
      --format '{{.Names}}\t{{.Label "com.docker.compose.project"}}\t{{.Label "com.docker.compose.service"}}\t{{.Label "com.docker.compose.project.config_files"}}\t{{.Label "com.docker.compose.project.environment_file"}}'
  )"; then
    printf 'homelab exporter: docker ps failed; skipping GitOps image metrics\n' >&2
    return 0
  fi

  # Rows are read on fd 3 so that the docker commands inside the loop cannot
  # consume them through stdin.
  # NOTE(review): tab is IFS whitespace, so an empty field in the middle of a
  # row shifts the following fields to the left.
  while IFS=$'\t' read -r -u 3 container project service config_files env_file; do
    [ -n "$container" ] || continue
    [ -n "$service" ] || continue
    [ -n "$config_files" ] || continue

    # Only the first entry of the comma-separated config file list is used.
    config_file="${config_files%%,*}"
    [ -f "$config_file" ] || continue

    compose_args=(-f "$config_file")
    if [ -n "$env_file" ] && [ -f "$env_file" ]; then
      compose_args+=(--env-file "$env_file")
    fi

    # Image declared for this service in the rendered Compose config.
    expected="$(
      docker compose "${compose_args[@]}" config --format json 2>/dev/null |
        jq -r --arg service "$service" '.services[$service].image // empty' 2>/dev/null || true
    )"
    [ -n "$expected" ] || continue

    # Image reference the container was started with.
    running="$(docker inspect -f '{{.Config.Image}}' "$container" 2>/dev/null || true)"
    [ -n "$running" ] || continue

    match="0"
    if [ "$running" = "$expected" ]; then
      match="1"
    fi

    printf 'homelab_gitops_runtime_image_match{name="%s",project="%s",service="%s"} %s\n' \
      "$container" "$project" "$service" "$match"
  done 3<<<"$rows"

  return 0
}
|
|
|
|
{
|
|
# Heartbeat: the reference time captured in $now for this run.
printf '%s\n' \
  '# HELP homelab_textfile_exporter_last_run_timestamp_seconds Unix timestamp of the last successful homelab textfile exporter run.' \
  '# TYPE homelab_textfile_exporter_last_run_timestamp_seconds gauge' \
  "homelab_textfile_exporter_last_run_timestamp_seconds $now"
|
|
|
|
# One gauge per critical container: 1 only when docker inspect reports the
# container as running; anything else (stopped, missing, docker error) is 0.
printf '%s\n' \
  '# HELP homelab_critical_container_running Whether a critical container is currently running according to docker inspect.' \
  '# TYPE homelab_critical_container_running gauge'

# $CRITICAL_CONTAINERS is left unquoted on purpose: it is a whitespace-separated
# list that has to be split into individual names.
for container in $CRITICAL_CONTAINERS; do
  case "$(docker inspect -f '{{.State.Running}}' "$container" 2>/dev/null || true)" in
    true) running="1" ;;
    *) running="0" ;;
  esac
  printf 'homelab_critical_container_running{name="%s"} %s\n' "$container" "$running"
done

# Compose image drift metrics (function defined earlier in this script).
emit_gitops_runtime_image_metrics
|
|
|
|
#######################################
# Run the Borg UI database query inside the Borg UI container and print the
# resulting metric samples.
#
# Globals:   BORG_CONTAINER (read), BORG_EXPECTED_SOURCES_FILE (read; passed
#            into the container environment)
# Arguments: none
# Outputs:   metric samples on stdout
# Returns:   exit status of `docker exec` (non-zero when the container is not
#            running or the Python query fails)
#######################################
query_borg_ui_database() {
  docker exec -i -e BORG_EXPECTED_SOURCES_FILE="$BORG_EXPECTED_SOURCES_FILE" "$BORG_CONTAINER" python3 - <<'PY'
import datetime as dt
import json
import os
from pathlib import Path
import sqlite3

conn = sqlite3.connect("/data/borg.db")
conn.row_factory = sqlite3.Row
cur = conn.cursor()

# Most recently started backup job, whatever its outcome.
latest = cur.execute("""
    select status, completed_at, archive_name
    from backup_jobs
    order by coalesce(started_at, created_at) desc
    limit 1
""").fetchone()

# Most recent backup job that actually completed.
completed = cur.execute("""
    select completed_at, archive_name
    from backup_jobs
    where status in ('completed', 'completed_with_warnings')
      and completed_at is not null
    order by completed_at desc
    limit 1
""").fetchone()


def parse_ts(value):
    """Convert a stored timestamp string to Unix seconds; 0 if empty or unparseable.

    Values without timezone information are interpreted as UTC.
    """
    if not value:
        return 0
    value = value.replace("Z", "+00:00")
    try:
        parsed = dt.datetime.fromisoformat(value)
    except ValueError:
        try:
            parsed = dt.datetime.strptime(value, "%Y-%m-%d %H:%M:%S")
        except ValueError:
            return 0
    if parsed.tzinfo is None:
        parsed = parsed.replace(tzinfo=dt.timezone.utc)
    return int(parsed.timestamp())


def escape_label(value):
    """Escape backslash, double quote and newline for use inside a label value."""
    return (
        (value or "")
        .replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
    )


def bool_metric(value):
    """Map a truthy/falsy value to 1/0."""
    return 1 if value else 0


latest_status = latest["status"] if latest else "missing"
latest_success = 1 if latest_status in ("completed", "completed_with_warnings") else 0
latest_warning = 1 if latest_status == "completed_with_warnings" else 0
completed_ts = parse_ts(completed["completed_at"]) if completed else 0
status_label = escape_label(str(latest_status))
latest_archive = escape_label(latest["archive_name"] if latest else "")
completed_archive = escape_label(completed["archive_name"] if completed else "")

print(f'homelab_borg_last_success{{status="{status_label}",archive="{latest_archive}"}} {latest_success}')
print(f'homelab_borg_last_job_warning{{status="{status_label}",archive="{latest_archive}"}} {latest_warning}')
print(f'homelab_borg_last_completed_timestamp_seconds{{archive="{completed_archive}"}} {completed_ts}')

# Only the repository with the lowest id is inspected.
repo = cur.execute("""
    select id, name, source_directories, last_check
    from repositories
    order by id
    limit 1
""").fetchone()

if repo:
    repo_name = escape_label(repo["name"] or str(repo["id"]))
    print(f'homelab_borg_repository_last_check_timestamp_seconds{{repository="{repo_name}"}} {parse_ts(repo["last_check"])}')

    try:
        loaded_sources = json.loads(repo["source_directories"] or "[]")
    except json.JSONDecodeError:
        loaded_sources = []
    # Anything that is not a JSON list (e.g. null) counts as "no sources".
    if isinstance(loaded_sources, list):
        configured_sources = [str(source) for source in loaded_sources]
    else:
        configured_sources = []
else:
    configured_sources = []

# Expected sources: one path per line; blank lines and '#' comments are ignored.
expected_path = Path(os.environ.get("BORG_EXPECTED_SOURCES_FILE", ""))
expected_file_present = expected_path.is_file()
if expected_file_present:
    expected_sources = [
        line.strip()
        for line in expected_path.read_text(encoding="utf-8").splitlines()
        if line.strip() and not line.lstrip().startswith("#")
    ]
else:
    expected_sources = []

configured_set = set(configured_sources)
expected_set = set(expected_sources)
missing_sources = [source for source in expected_sources if source not in configured_set]
extra_sources = [source for source in configured_sources if source not in expected_set]

print(f"homelab_borg_scope_expected_file_present {bool_metric(expected_file_present)}")
print(f"homelab_borg_scope_expected_sources_total {len(expected_sources)}")
print(f"homelab_borg_scope_configured_sources_total {len(configured_sources)}")
print(f"homelab_borg_scope_missing_sources_total {len(missing_sources)}")
print(f"homelab_borg_scope_extra_sources_total {len(extra_sources)}")

for source in expected_sources:
    value = 1 if source in configured_set else 0
    print(f'homelab_borg_scope_source_configured{{source="{escape_label(source)}"}} {value}')

# Configured but unexpected sources are reported with value 0 and state="extra".
for source in extra_sources:
    print(f'homelab_borg_scope_source_configured{{source="{escape_label(source)}",state="extra"}} 0')

for schedule in cur.execute("""
    select id, name, run_prune_after, run_compact_after
    from scheduled_jobs
    where enabled = 1
    order by id
"""):
    schedule_name = escape_label(schedule["name"] or str(schedule["id"]))
    print(f'homelab_borg_schedule_prune_after_enabled{{schedule="{schedule_name}"}} {bool_metric(schedule["run_prune_after"])}')
    print(f'homelab_borg_schedule_compact_after_enabled{{schedule="{schedule_name}"}} {bool_metric(schedule["run_compact_after"])}')
PY
}

#######################################
# Emit the Borg UI metric family: HELP/TYPE header, then either the samples
# produced by query_borg_ui_database or a zero-valued fallback set.
#
# The query output is captured before anything is printed, so a query that
# fails half-way never leaves partial samples next to the fallback ones, and a
# failing `docker exec` (for example a container that exists but is stopped)
# cannot abort the exporter under `set -e`.
#
# Fallback status label: "container_missing" when docker inspect does not find
# the container, "query_failed" when the container exists but the query fails.
#
# Globals:   BORG_CONTAINER (read)
# Arguments: none
# Outputs:   metrics on stdout; a one-line warning on stderr on query failure
# Returns:   0 always
#######################################
emit_borg_ui_metrics() {
  local borg_output
  local failure_status="container_missing"

  cat <<'EOF'
# HELP homelab_borg_last_completed_timestamp_seconds Unix timestamp of the most recent completed Borg backup job known to Borg UI.
# TYPE homelab_borg_last_completed_timestamp_seconds gauge
# HELP homelab_borg_last_success Whether the most recent Borg backup job completed successfully.
# TYPE homelab_borg_last_success gauge
# HELP homelab_borg_last_job_warning Whether the most recent Borg backup job completed with warnings.
# TYPE homelab_borg_last_job_warning gauge
# HELP homelab_borg_repository_last_check_timestamp_seconds Unix timestamp of the latest Borg repository check known to Borg UI.
# TYPE homelab_borg_repository_last_check_timestamp_seconds gauge
# HELP homelab_borg_scope_expected_file_present Whether the expected Borg source list file is visible inside Borg UI.
# TYPE homelab_borg_scope_expected_file_present gauge
# HELP homelab_borg_scope_expected_sources_total Number of expected Borg source paths from the repo source list.
# TYPE homelab_borg_scope_expected_sources_total gauge
# HELP homelab_borg_scope_configured_sources_total Number of Borg source paths configured in Borg UI.
# TYPE homelab_borg_scope_configured_sources_total gauge
# HELP homelab_borg_scope_missing_sources_total Number of expected Borg source paths missing from Borg UI.
# TYPE homelab_borg_scope_missing_sources_total gauge
# HELP homelab_borg_scope_extra_sources_total Number of Borg UI source paths not present in the repo source list.
# TYPE homelab_borg_scope_extra_sources_total gauge
# HELP homelab_borg_scope_source_configured Whether an expected Borg source path is configured in Borg UI.
# TYPE homelab_borg_scope_source_configured gauge
# HELP homelab_borg_schedule_prune_after_enabled Whether a Borg scheduled job runs prune after backup.
# TYPE homelab_borg_schedule_prune_after_enabled gauge
# HELP homelab_borg_schedule_compact_after_enabled Whether a Borg scheduled job runs compact after backup.
# TYPE homelab_borg_schedule_compact_after_enabled gauge
EOF

  if docker inspect "$BORG_CONTAINER" >/dev/null 2>&1; then
    if borg_output="$(query_borg_ui_database)"; then
      if [ -n "$borg_output" ]; then
        printf '%s\n' "$borg_output"
      fi
      return 0
    fi
    failure_status="query_failed"
    printf 'homelab exporter: Borg UI query in container %s failed; emitting fallback metrics\n' \
      "$BORG_CONTAINER" >&2
  fi

  printf 'homelab_borg_last_success{status="%s",archive=""} 0\n' "$failure_status"
  printf 'homelab_borg_last_job_warning{status="%s",archive=""} 0\n' "$failure_status"
  printf 'homelab_borg_last_completed_timestamp_seconds{archive=""} 0\n'
  printf 'homelab_borg_repository_last_check_timestamp_seconds{repository=""} 0\n'
  printf 'homelab_borg_scope_expected_file_present 0\n'
  printf 'homelab_borg_scope_expected_sources_total 0\n'
  printf 'homelab_borg_scope_configured_sources_total 0\n'
  printf 'homelab_borg_scope_missing_sources_total 0\n'
  printf 'homelab_borg_scope_extra_sources_total 0\n'
  return 0
}

emit_borg_ui_metrics
|
|
|
|
# Measure dump freshness on the host side. This closes the blind spot where
# Borg keeps running and archives stale dumps without any job error
# (pre-backup-dumps.sh stopped). It runs outside the borg-ui container because
# the dumps live on the host under $BORG_DUMP_DIR.
printf '%s\n' \
  '# HELP homelab_borg_dump_present Whether an expected Borg pre-backup dump artifact exists in the latest dump set.' \
  '# TYPE homelab_borg_dump_present gauge' \
  '# HELP homelab_borg_dump_age_seconds Age in seconds of an expected Borg pre-backup dump artifact.' \
  '# TYPE homelab_borg_dump_age_seconds gauge'

# Dump artifacts that are expected inside $BORG_DUMP_DIR.
expected_dumps=(
  postgresql17-globals.sql
  postgresql17-mailarchiver.dump
  postgresql17-paperless.dump
  mealie.dump
  immich.dump
  nextcloud.dump
  gitea.sqlite.dump
  vaultwarden.sqlite.dump
  n8n.sqlite.dump
  unraid-flash-config.tar.gz
  komodo-mongo.archive.gz
)

for dump in "${expected_dumps[@]}"; do
  dump_path="$BORG_DUMP_DIR/$dump"

  # A missing artifact gets present=0 and no age sample.
  if [ ! -f "$dump_path" ]; then
    printf 'homelab_borg_dump_present{dump="%s"} 0\n' "$dump"
    continue
  fi

  # Age is measured against the run's reference time; if stat fails the mtime
  # falls back to 0.
  dump_mtime="$(stat -c %Y "$dump_path" 2>/dev/null || echo 0)"
  printf 'homelab_borg_dump_present{dump="%s"} 1\n' "$dump"
  printf 'homelab_borg_dump_age_seconds{dump="%s"} %s\n' "$dump" "$(( now - dump_mtime ))"
done
|
|
} > "$tmp"
|
|
|
|
# Publish the finished metrics file.
# Mode 0644 instead of the mktemp default 0600, so that the node-exporter
# textfile collector (runs as nobody:65534) can read the file.
chmod 0644 "$tmp"
mv "$tmp" "$OUTPUT_FILE"

# The staging file has been moved into place, so the EXIT cleanup is disarmed.
trap - EXIT

# Report the path of the file that was written.
printf '%s\n' "$OUTPUT_FILE"
|