report: unhealthy-Container namentlich + Image-Age-Allowlist

Zwei Verbesserungen am Daily Operations Report, ausgeloest durch den
versteckten immich_machine_learning-Ausfall (der Container blieb 2,3 Tage
unbemerkt unhealthy, weil der Report nur "unhealthy=1" zaehlte, ohne Namen
und Grund zu nennen):

1. collect_container_state: neue Sektion "Unhealthy Container" listet jeden
   unhealthy Container mit FailingStreak und letztem Healthcheck-Output.
   So ist sofort sichtbar WELCHER Container und WARUM.

2. collect_image_freshness: neue Image-Age-Allowlist
   (image-age-allow.patterns). Bewusst gepinnte, aber aktuelle/empfohlene
   Images (immich_postgres = exakt Immichs Pin; blackbox-exporter v0.28.0 =
   latest) werden mit Recheck-Datum von der Ueberalterungs-Warnung
   ausgenommen. Nach Ablauf des Recheck-Datums greift die Ausnahme nicht
   mehr -> erzwingt Neubewertung statt stillen Alterns. Top-10-Tabelle hat
   jetzt eine Hinweis-Spalte (ueberaltert / bewusst gepinnt).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-10 11:08:44 +02:00
parent ed55b88ec1
commit 2f64aee109
2 changed files with 102 additions and 10 deletions
+72 -10
View File
@@ -11,6 +11,7 @@ SINCE="${SINCE:-24h}"
MAX_LOG_LINES="${MAX_LOG_LINES:-80}"
CERT_MAX_ROWS="${CERT_MAX_ROWS:-12}"
IMAGE_AGE_WARN_DAYS="${IMAGE_AGE_WARN_DAYS:-180}"
IMAGE_AGE_ALLOW_FILE="${IMAGE_AGE_ALLOW_FILE:-/mnt/user/services/homelab-infra/services/posture-check/image-age-allow.patterns}"
LOG_VOLUME_TOP_N="${LOG_VOLUME_TOP_N:-10}"
DISK_USAGE_WARN_PCT="${DISK_USAGE_WARN_PCT:-85}"
CERT_WARN_DAYS="${CERT_WARN_DAYS:-21}"
@@ -581,13 +582,36 @@ collect_image_freshness() {
local image_file="$TMP_DIR/images.tsv"
local image_warnings=0
local image_allowed=0
local now_epoch
: > "$image_file"
now_epoch="$(date +%s)"
# Parse the image-age allowlist: containers that are deliberately pinned to a
# stable or upstream-recommended image. Each entry carries a recheck date; once
# that date has passed the suppression lapses, so a pin gets re-reviewed instead
# of silently aging forever.
local allow_file="$TMP_DIR/image-allow.tsv"
: > "$allow_file"
if [ -f "$IMAGE_AGE_ALLOW_FILE" ]; then
while IFS= read -r line; do
line="${line%%#*}"
line="$(printf '%s' "$line" | sed -E 's/^[[:space:]]+//; s/[[:space:]]+$//')"
[ -n "$line" ] || continue
local a_name a_date a_epoch
a_name="$(printf '%s' "$line" | awk '{ print $1 }')"
a_date="$(printf '%s' "$line" | awk '{ print $2 }')"
[ -n "$a_name" ] && [ -n "$a_date" ] || continue
a_epoch="$(date -d "$a_date" +%s 2>/dev/null || echo 0)"
if [ "$a_epoch" -ge "$now_epoch" ]; then
printf '%s\t%s\n' "$a_name" "$a_date" >> "$allow_file"
fi
done < "$IMAGE_AGE_ALLOW_FILE"
fi
while IFS= read -r name; do
[ -n "$name" ] || continue
local image_id created_iso created_epoch age_days image_tag
local image_id created_iso created_epoch age_days image_tag note recheck
image_id="$(docker inspect --format '{{.Image}}' "$name" 2>/dev/null || true)"
[ -n "$image_id" ] || continue
created_iso="$(docker image inspect --format '{{.Created}}' "$image_id" 2>/dev/null || true)"
@@ -596,33 +620,46 @@ collect_image_freshness() {
created_epoch="$(date -d "$created_iso" +%s 2>/dev/null || echo 0)"
[ "$created_epoch" -gt 0 ] || continue
age_days=$(( (now_epoch - created_epoch) / 86400 ))
printf '%d\t%s\t%s\n' "$age_days" "$name" "$image_tag" >> "$image_file"
note=""
if [ "$age_days" -ge "$IMAGE_AGE_WARN_DAYS" ]; then
image_warnings=$((image_warnings + 1))
recheck="$(awk -F '\t' -v n="$name" '$1 == n { print $2; found = 1 } END { exit !found }' "$allow_file" || true)"
if [ -n "$recheck" ]; then
note="bewusst gepinnt (recheck $recheck)"
image_allowed=$((image_allowed + 1))
else
note="ueberaltert"
image_warnings=$((image_warnings + 1))
fi
fi
printf '%d\t%s\t%s\t%s\n' "$age_days" "$name" "$image_tag" "$note" >> "$image_file"
done < <(docker ps --format '{{.Names}}')
set_summary "image_warnings" "$image_warnings"
set_summary "image_allowed" "$image_allowed"
if [ ! -s "$image_file" ]; then
append "- Keine Image-Daten verfuegbar."
record_section_error "images" "Keine Image-Daten ermittelt"
else
append "- Schwelle Warnung: Image aelter als $IMAGE_AGE_WARN_DAYS Tage"
append "- Container mit Image >= $IMAGE_AGE_WARN_DAYS Tage: $image_warnings"
append "- Container mit ueberaltertem Image (gewarnt): $image_warnings"
append "- Davon bewusst gepinnt (von Warnung ausgenommen): $image_allowed"
append "- Allowlist-Quelle: \`$IMAGE_AGE_ALLOW_FILE\`"
append ""
append "### Aelteste Images (Top 10)"
append ""
append "| Alter Tage | Container | Image |"
append "|---:|---|---|"
sort -nr "$image_file" | head -n 10 | while IFS="$(printf '\t')" read -r age name img; do
append "| $age | $name | $img |"
append "| Alter Tage | Container | Image | Hinweis |"
append "|---:|---|---|---|"
sort -nr "$image_file" | head -n 10 | while IFS="$(printf '\t')" read -r age name img note; do
append "| $age | $name | $img | ${note:-} |"
done
append ""
if [ "$image_warnings" -eq 0 ]; then
if [ "$image_warnings" -eq 0 ] && [ "$image_allowed" -eq 0 ]; then
append "Bewertung: Keine Container mit ueberalterten Images. CVE-Hygiene aus dieser Sicht ok."
elif [ "$image_warnings" -eq 0 ]; then
append "Bewertung: Keine ungeprueft ueberalterten Images. $image_allowed Container sind bewusst gepinnt und mit Recheck-Datum dokumentiert."
else
append "Bewertung: $image_warnings Container nutzen Images aelter als $IMAGE_AGE_WARN_DAYS Tage. Update-Pipeline und CVE-Status pruefen."
append "Bewertung: $image_warnings Container nutzen ueberalterte Images (nicht in der Allowlist). Update-Pipeline und CVE-Status pruefen."
fi
fi
append ""
@@ -663,6 +700,31 @@ collect_container_events() {
collect_container_state() {
append "## Container-Zustand"
append ""
append "### Unhealthy Container"
local unhealthy_file="$TMP_DIR/unhealthy.log"
docker ps --filter health=unhealthy --format '{{.Names}}' > "$unhealthy_file"
if [ ! -s "$unhealthy_file" ]; then
append "- Keine."
else
append "| Container | FailingStreak | Letzter Healthcheck |"
append "|---|---:|---|"
while IFS= read -r name; do
[ -n "$name" ] || continue
local streak hc
streak="$(docker inspect "$name" --format '{{.State.Health.FailingStreak}}' 2>/dev/null || echo '?')"
# Letzten nicht-leeren Health-Log-Eintrag holen, einzeilig machen und
# Pipe-Zeichen escapen, damit die Markdown-Tabelle nicht bricht.
hc="$(docker inspect "$name" --format '{{range .State.Health.Log}}exit={{.ExitCode}} out={{.Output}}~~~{{end}}' 2>/dev/null \
| tr '\n' ' ' \
| awk -F '~~~' '{ for (i = NF - 1; i >= 1; i--) { if ($i != "") { print $i; break } } }' \
| sed -E 's/[[:space:]]+/ /g; s/\|/\\|/g' \
| cut -c1-160)"
append "| \`$name\` | ${streak:-?} | ${hc:-(kein Output)} |"
done < "$unhealthy_file"
fi
append ""
append "### Nicht laufende Container"
local stopped_file="$TMP_DIR/stopped.log"
docker ps -a --filter status=exited --filter status=dead --filter status=created --format '{{.Names}}\t{{.Status}}' > "$stopped_file"