Files
homelab-infra/services/posture-check/komodo-stack-hygiene.sh
T
Micha ad9bb40b95 Harden posture/borg audit scripts (robustness + coverage)
Working-tree improvements to the audit scripts (authored locally, not by me;
reviewed for correctness + bash -n clean before commit):

- compose-runtime-drift: prefer `docker compose config` for the expected image
  with a raw-parse fallback; raw parser now resolves YAML anchors (*alias) so
  anchor-based composes (e.g. dawarich) no longer mis-report drift.
- komodo-stack-hygiene: treat an unreachable Komodo API as critical and exit 3
  so the Healthchecks EXIT trap sends /fail (the monitor itself is down, not
  "all green"); git fetch before hash-drift compare; clearer "cannot compare"
  message; pin in-container km host to localhost:9120.
- cert-token-check: expand monitored cert domains to the full set incl.
  hc.kaleschke.info.
- gitea-bundle-mirror: skip empty repos without refs instead of failing.
- unraid-user-scripts.md: document SEND_NTFY/NTFY_TOPIC for the daily report.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-24 11:35:55 +02:00

273 lines
11 KiB
Bash

#!/usr/bin/env bash
set -euo pipefail
# Komodo-Stack-Hygiene-Check.
#
# Prueft, dass jeder Komodo-Stack sauber gegen das Git-Repo konfiguriert ist,
# und dass jeder Compose-File im Repo einen passenden Komodo-Stack hat.
# Findet die Klasse von Fehlern, die `immich_new` (2026-06-12) durchgelassen
# hat: Stack RUNNING, aber kein Repo / kein Account / project_missing.
REPO_ROOT="${REPO_ROOT:-$(cd "$(dirname "${BASH_SOURCE[0]}")/../.." && pwd)}"
OUTPUT_PATH="${OUTPUT_PATH:-/mnt/user/services/posture-check/komodo-stack-hygiene-last.json}"
NTFY_SCRIPT="${NTFY_SCRIPT:-$REPO_ROOT/ops/restore-tests/send-ntfy.sh}"
NTFY_TOPIC="${NTFY_TOPIC:-homelab-alerts}"
SEND_NTFY="${SEND_NTFY:-1}"
KOMODO_ENV_FILE="${KOMODO_ENV_FILE:-/mnt/user/appdata/secrets/codex_komodo_api.env}"
KOMODO_CONTAINER="${KOMODO_CONTAINER:-komodo-core}"
KOMODO_CLI_HOST_FOR_CONTAINER="${KOMODO_CLI_HOST_FOR_CONTAINER:-http://localhost:9120}"
FETCH_BEFORE_DIFF="${FETCH_BEFORE_DIFF:-1}"
# Komma-separierte Allowlist fuer bewusst inline-managed Stacks.
# Quelle: memory/komodo-stack-inline-managed.md, CLAUDE.md.
INLINE_ALLOWLIST="${INLINE_ALLOWLIST:-komodo,grafana}"
# Compose-Files unter diesen Pfaden zaehlen NICHT als erwartete Stacks
# (Beispiele, Archive, Submodule).
COMPOSE_EXCLUDE_PATTERN="${COMPOSE_EXCLUDE_PATTERN:-/archive/|/examples/|/.git/}"
# Compose-Dir-Namen, die bewusst NICHT als Komodo-Stack laufen sollen
# (Work-in-progress, Build-/Dev-Compose, manuell deployed). Komma-separiert.
EXPECTED_NOT_IN_KOMODO="${EXPECTED_NOT_IN_KOMODO:-hermes-agent}"
TMP_DIR="${TMP_DIR:-/tmp/kallilab-komodo-stack-hygiene}"
mkdir -p "$TMP_DIR"
RESULTS_FILE="$TMP_DIR/results.$$"
STACKS_FILE="$TMP_DIR/stacks.$$.json"
API_ERROR_FILE="$TMP_DIR/komodo-api.$$.err"
API_UNREACHABLE=0
: > "$RESULTS_FILE"
: > "$API_ERROR_FILE"
# Healthchecks Heartbeat (endpoint-agnostisch; Capability-URL ist ein Secret, nie ins Repo)
HC_URL_FILE="${HC_URL_FILE:-/mnt/user/appdata/secrets/healthchecks_komodo_hygiene_url}"
hc_url=""; [ -r "$HC_URL_FILE" ] && hc_url="$(tr -d '[:space:]' < "$HC_URL_FILE")"
hc_ping() { [ -n "$hc_url" ] || return 0; curl -fsS -m 10 --retry 3 "${hc_url}${1:-}" >/dev/null 2>&1 || true; }
trap 'hc_rc=$?; rm -f "$RESULTS_FILE" "$STACKS_FILE" "$API_ERROR_FILE"; [ "$hc_rc" -le 2 ] && hc_ping "" || hc_ping "/fail"' EXIT
hc_ping "/start"
json_escape() {
sed -e 's/\\/\\\\/g' -e 's/"/\\"/g' -e 's/\t/\\t/g'
}
add_result() {
printf '%s\t%s\t%s\n' "$1" "$2" "$3" >> "$RESULTS_FILE"
}
is_inline_allowed() {
local name="$1"
local IFS=,
for entry in $INLINE_ALLOWLIST; do
[ "$name" = "$entry" ] && return 0
done
return 1
}
is_expected_not_in_komodo() {
local name="$1"
local IFS=,
for entry in $EXPECTED_NOT_IN_KOMODO; do
[ "$name" = "$entry" ] && return 0
done
return 1
}
# True drift: do files inside this stack's compose-dir actually differ
# between deployed_hash and latest_hash? Komodo's deployed_hash bumps only
# on redeploy, while latest_hash tracks master HEAD - that produces a noisy
# "Pending Update" even when the stack itself wasn't touched.
stack_files_changed() {
local name="$1" deployed="$2" latest="$3"
local dir
HASH_COMPARE_REASON=""
# Locate the stack's compose dir (case-insensitive, same as Mode 3).
dir="$(find "$REPO_ROOT" -type d -iname "$name" -not -path "*/.git/*" 2>/dev/null | head -1)"
if [ -z "$dir" ]; then
HASH_COMPARE_REASON="no compose directory found for stack"
return 1
fi
if ! ( cd "$REPO_ROOT" && git rev-parse --verify --quiet "$deployed" >/dev/null ); then
HASH_COMPARE_REASON="deployed_hash $deployed is not available in local repo"
return 1
fi
if ! ( cd "$REPO_ROOT" && git rev-parse --verify --quiet "$latest" >/dev/null ); then
HASH_COMPARE_REASON="latest_hash $latest is not available in local repo"
return 1
fi
local rel="${dir#$REPO_ROOT/}"
if ( cd "$REPO_ROOT" && git diff --quiet "$deployed".."$latest" -- "$rel" ); then
return 1 # no change
fi
return 0 # real change
}
if [ "$FETCH_BEFORE_DIFF" = "1" ]; then
if ! ( cd "$REPO_ROOT" && git fetch --quiet origin >/dev/null 2>&1 ); then
add_result "warning" "repo-fetch" "Could not fetch origin before hash drift comparisons"
fi
fi
# Komodo-API-Credentials laden und Stack-Liste holen.
if [ ! -r "$KOMODO_ENV_FILE" ]; then
API_UNREACHABLE=1
add_result "critical" "komodo-api" "Komodo env file not readable: $KOMODO_ENV_FILE"
else
set -a
# shellcheck disable=SC1090
. "$KOMODO_ENV_FILE"
set +a
if ! docker exec \
-e "KOMODO_CLI_HOST=$KOMODO_CLI_HOST_FOR_CONTAINER" \
-e KOMODO_CLI_KEY \
-e KOMODO_CLI_SECRET \
"$KOMODO_CONTAINER" km list -a stacks -f json > "$STACKS_FILE" 2>"$API_ERROR_FILE"; then
API_UNREACHABLE=1
api_error="$(tr '\n' ' ' < "$API_ERROR_FILE" | sed -E 's/[[:space:]]+/ /g' | cut -c 1-180)"
add_result "critical" "komodo-api" "km list stacks failed (container=$KOMODO_CONTAINER): ${api_error:-unknown error}"
: > "$STACKS_FILE"
fi
fi
# Per-Stack-Checks. Trenner: "|" statt Tab, weil IFS=Tab leere Felder kollabiert
# (Tab ist Whitespace in IFS). "|" kommt in Stack-Namen/Repos/Hashes nicht vor.
if [ -s "$STACKS_FILE" ]; then
while IFS='|' read -r name repo project_missing missing_files state deployed_hash latest_hash files_on_host file_contents; do
[ -n "$name" ] || continue
if is_inline_allowed "$name"; then
add_result "ok" "$name" "Inline-managed (allowlisted), skipping repo checks"
continue
fi
# Failure-Mode 1: Stack hat keine Git-Quelle (immich_new-Symptom).
if [ "$repo" = "-" ] && [ "$files_on_host" != "True" ] && [ "$file_contents" != "True" ]; then
add_result "critical" "$name" "Stack has no repo configured and is not inline-allowed"
continue
fi
# Failure-Mode 2: Komodo meldet Project Missing.
if [ "$project_missing" = "True" ]; then
add_result "critical" "$name" "project_missing=true (missing_files=$missing_files)"
continue
fi
# Failure-Mode 3: Stack-Name passt zu keinem Compose-File im Repo.
# Case-insensitive (Compose-Dir kann GroSs/klein abweichen, z.B. Adguard).
match_found=""
while IFS= read -r dir; do
[ -n "$dir" ] || continue
if [ -f "$dir/docker-compose.yml" ] \
|| [ -f "$dir/docker-compose.yaml" ] \
|| [ -f "$dir/compose.yml" ] \
|| [ -f "$dir/compose.yaml" ]; then
match_found=1
break
fi
done < <(find "$REPO_ROOT" -type d -iname "$name" -not -path "*/.git/*" 2>/dev/null)
if [ -z "$match_found" ]; then
# Verwaiste Stacks wie das frueher gesehene `immich_new`: Komodo kennt
# ihn, aber im Repo gibt's keinen Compose-Pfad.
add_result "warning" "$name" "Stack name does not match any compose directory in repo"
fi
# Failure-Mode 4: Deployed-Hash hinkt latest hinterher UND der Stack-Dir
# hat tatsaechlich File-Aenderungen dazwischen. Reine Komodo-Hash-Bewegung
# ohne Stack-Inhalt aendert nichts und ist kein echter Drift.
# "-" = unbekannt (z.B. gitea self-host edge case), nicht als Drift werten.
if [ "$deployed_hash" != "-" ] && [ "$latest_hash" != "-" ] \
&& [ "$deployed_hash" != "$latest_hash" ]; then
if stack_files_changed "$name" "$deployed_hash" "$latest_hash"; then
add_result "warning" "$name" "deployed_hash $deployed_hash != latest_hash $latest_hash (stack files changed)"
elif [ -n "${HASH_COMPARE_REASON:-}" ]; then
add_result "warning" "$name" "deployed_hash $deployed_hash != latest_hash $latest_hash (cannot compare: $HASH_COMPARE_REASON)"
fi
fi
# Failure-Mode 5: Stack ist down.
if [ "$state" = "down" ] || [ "$state" = "unknown" ]; then
add_result "warning" "$name" "Stack state is $state"
fi
add_result "ok" "$name" "Stack hygiene OK (state=$state, hash=$deployed_hash)"
done < <(jq -r '.[] | [
.name // "-",
(.info.repo // "-"),
(.info.project_missing | if . then "True" else "False" end),
(((.info.missing_files // []) | join(",")) | if . == "" then "-" else . end),
(.info.state // "-"),
(.info.deployed_hash // "-"),
(.info.latest_hash // "-"),
(.info.files_on_host | if . then "True" else "False" end),
(.info.file_contents | if . then "True" else "False" end)
] | join("|")' "$STACKS_FILE")
fi
# Failure-Mode 6: Compose-File im Repo, aber kein Komodo-Stack mit gleichem Namen.
if [ -s "$STACKS_FILE" ]; then
known_names="$(jq -r '.[].name' "$STACKS_FILE")"
while IFS= read -r -d '' compose; do
rel="${compose#$REPO_ROOT/}"
if printf '%s' "$rel" | grep -Eq "$COMPOSE_EXCLUDE_PATTERN"; then
continue
fi
dir_name="$(basename "$(dirname "$compose")")"
if is_inline_allowed "$dir_name"; then
continue
fi
if is_expected_not_in_komodo "$dir_name"; then
continue
fi
# Case-insensitive, weil z.B. host-services/Adguard <-> Komodo-Stack adguard
# legitim als gematched gilt.
if ! printf '%s\n' "$known_names" | grep -Fixq "$dir_name"; then
add_result "warning" "$dir_name" "Compose file $rel has no matching Komodo stack"
fi
done < <(find "$REPO_ROOT" -path "$REPO_ROOT/.git" -prune -o -type f \
\( -name docker-compose.yml -o -name docker-compose.yaml \
-o -name compose.yml -o -name compose.yaml \) -print0)
fi
timestamp="$(date -Iseconds)"
critical_count="$(awk -F '\t' '$1 == "critical" { c++ } END { print c + 0 }' "$RESULTS_FILE")"
warning_count="$(awk -F '\t' '$1 == "warning" { c++ } END { print c + 0 }' "$RESULTS_FILE")"
status="ok"
[ "$warning_count" -gt 0 ] && status="warning"
[ "$critical_count" -gt 0 ] && status="critical"
mkdir -p "$(dirname "$OUTPUT_PATH")"
{
printf '{\n'
printf ' "timestamp": "%s",\n' "$(printf '%s' "$timestamp" | json_escape)"
printf ' "status": "%s",\n' "$status"
printf ' "critical_count": %s,\n' "$critical_count"
printf ' "warning_count": %s,\n' "$warning_count"
printf ' "checks": [\n'
first=1
while IFS=$'\t' read -r severity name message; do
if [ "$first" -eq 0 ]; then printf ',\n'; fi
first=0
printf ' {"severity":"%s","name":"%s","message":"%s"}' \
"$(printf '%s' "$severity" | json_escape)" \
"$(printf '%s' "$name" | json_escape)" \
"$(printf '%s' "$message" | json_escape)"
done < "$RESULTS_FILE"
printf '\n ]\n}\n'
} > "$OUTPUT_PATH.tmp"
mv "$OUTPUT_PATH.tmp" "$OUTPUT_PATH"
cat "$OUTPUT_PATH"
if [ "$critical_count" -gt 0 ] || [ "$warning_count" -gt 0 ]; then
if [ "$SEND_NTFY" = "1" ] && [ -x "$NTFY_SCRIPT" ]; then
priority="default"
[ "$warning_count" -gt 0 ] && priority="high"
[ "$critical_count" -gt 0 ] && priority="urgent"
"$NTFY_SCRIPT" "$NTFY_TOPIC" \
"Komodo stack hygiene: $critical_count critical, $warning_count warning" \
"See $OUTPUT_PATH" "$priority" || true
fi
# If Komodo could not be queried at all, the hygiene monitor itself is broken.
# Use rc=3 so the Healthchecks EXIT trap sends /fail instead of a green ping.
[ "$API_UNREACHABLE" -eq 1 ] && exit 3
[ "$critical_count" -gt 0 ] && exit 2
exit 1
fi