homelab-infra/services/posture-check/daily-status-report.sh

#!/usr/bin/env bash
# Strict mode: abort on unhandled errors, on unset variables, and when any
# stage of a pipeline fails.
set -euo pipefail

# Epoch seconds at which this run started.
SCRIPT_START="$(date +%s)"

# --- Configuration ------------------------------------------------------------
# Every setting below can be overridden through the environment; the text after
# ":-" is the default that applies when the variable is unset or empty.

# Output locations. REPORT_PATH and PERSISTENT_SUMMARY_PATH default to files
# inside REPORT_DIR whose names carry REPORT_DATE (date +%F, i.e. YYYY-MM-DD).
REPORT_DIR="${REPORT_DIR:-/mnt/user/services/posture-check/daily-reports}"
REPORT_DATE="${REPORT_DATE:-$(date +%F)}"
REPORT_PATH="${REPORT_PATH:-$REPORT_DIR/homelab-day-$REPORT_DATE.md}"
PERSISTENT_SUMMARY_PATH="${PERSISTENT_SUMMARY_PATH:-$REPORT_DIR/summary-$REPORT_DATE.env}"
# Look-back window handed to `docker logs --since` / `docker events --since`.
SINCE="${SINCE:-24h}"
# Upper bound for raw log lines copied into the report (5xx tail, log examples).
MAX_LOG_LINES="${MAX_LOG_LINES:-80}"
# Number of certificate rows shown (those with the shortest remaining lifetime).
CERT_MAX_ROWS="${CERT_MAX_ROWS:-12}"
# Image age in days from which a running container counts as a warning.
IMAGE_AGE_WARN_DAYS="${IMAGE_AGE_WARN_DAYS:-180}"
# Number of containers listed in the log-volume ranking.
LOG_VOLUME_TOP_N="${LOG_VOLUME_TOP_N:-10}"
# Filesystem usage in percent from which a path counts as a warning.
DISK_USAGE_WARN_PCT="${DISK_USAGE_WARN_PCT:-85}"
# Certificates with fewer remaining days than this count as a warning.
CERT_WARN_DAYS="${CERT_WARN_DAYS:-21}"
# NOTE(review): presumably the latest/median duration ratio above which a
# backup run counts as drift - confirm against collect_borg.
BACKUP_DRIFT_FACTOR="${BACKUP_DRIFT_FACTOR:-2.0}"
# "1" additionally prints the lines that were classified as known noise.
SHOW_KNOWN_NOISE="${SHOW_KNOWN_NOISE:-0}"
# Notification settings (mail / ntfy). NOTE(review): their consumers are not in
# the part of the file reviewed here - semantics to be confirmed there.
SEND_MAIL="${SEND_MAIL:-0}"
MAIL_MODE="${MAIL_MODE:-always}"
MAIL_SCRIPT="${MAIL_SCRIPT:-/mnt/user/services/homelab-infra/services/posture-check/send-operations-report-mail.sh}"
SEND_NTFY="${SEND_NTFY:-0}"
NTFY_TOPIC="${NTFY_TOPIC:-homelab-info}"
NTFY_SCRIPT="${NTFY_SCRIPT:-/mnt/user/services/homelab-infra/ops/restore-tests/send-ntfy.sh}"
# Names of the containers queried for backup and alert data.
BORG_CONTAINER="${BORG_CONTAINER:-borg-ui}"
PROMETHEUS_CONTAINER="${PROMETHEUS_CONTAINER:-monitoring-prometheus}"
# Traefik ACME store that the certificate section reads expiry dates from.
TRAEFIK_ACME_PATH="${TRAEFIK_ACME_PATH:-/mnt/user/appdata/traefik/letsencrypt/acme.json}"
# Known-noise regex list, the helper that normalizes it, the per-pattern hit
# count above which noise is escalated, and the size of the noise rankings.
NOISE_PATTERNS_FILE="${NOISE_PATTERNS_FILE:-/mnt/user/services/homelab-infra/services/posture-check/log-noise.patterns}"
NORMALIZE_NOISE_SCRIPT="${NORMALIZE_NOISE_SCRIPT:-/mnt/user/services/homelab-infra/services/posture-check/lib/normalize-noise-patterns.sh}"
NOISE_ESCALATION_THRESHOLD="${NOISE_ESCALATION_THRESHOLD:-500}"
NOISE_BREAKDOWN_TOP_N="${NOISE_BREAKDOWN_TOP_N:-10}"
# JSON result of the last posture check; the overview reads its "status" field
# and the file's modification time.
POSTURE_CHECK_FILE="${POSTURE_CHECK_FILE:-/mnt/user/services/posture-check/last.json}"
LOCK_FILE="${LOCK_FILE:-/tmp/homelab-daily-report.lock}"
# Overall verdict; stays UNKNOWN until derive_report_status assigns it.
REPORT_STATUS="UNKNOWN"

# --- Single-instance guard ----------------------------------------------------
# Open the lock file on fd 9 and take a non-blocking exclusive lock on it. The
# fd is never closed explicitly, so the lock is held for the rest of the run.
# A second concurrent run exits with status 3.
exec 9>"$LOCK_FILE"
if ! flock -n 9; then
  echo "Another daily-status-report run is already in progress (lock: $LOCK_FILE)" >&2
  exit 3
fi

# --- Scratch area -------------------------------------------------------------
# Per-run temp directory holding the report body, the key=value summary and the
# list of section errors; all three files are created empty up front.
TMP_DIR="$(mktemp -d /tmp/homelab-daily-report.XXXXXX)"
BODY_PATH="$TMP_DIR/body.md"
SUMMARY_PATH="$TMP_DIR/summary.env"
SECTION_ERRORS_FILE="$TMP_DIR/section-errors.log"
: > "$BODY_PATH"
: > "$SUMMARY_PATH"
: > "$SECTION_ERRORS_FILE"

# Delete the per-run scratch directory together with everything inside it.
# Globals: TMP_DIR (read)
cleanup() {
  local scratch_dir="$TMP_DIR"
  rm -rf "$scratch_dir"
}
# Run the cleanup on every exit path of the script.
trap 'cleanup' EXIT

# Append one line to the report body. All arguments are joined into a single
# string (the way "$*" joins them) and terminated with a newline; calling it
# without arguments or with "" writes an empty line.
# Globals: BODY_PATH (appended to)
append() {
  local line="$*"
  printf '%s\n' "$line" >>"$BODY_PATH"
}

# Append everything read from stdin, unmodified, to the report body.
# Globals: BODY_PATH (appended to)
append_block() {
  cat - >>"$BODY_PATH"
}

# Record one key/value pair in the summary file.
#
# The summary file is later read back with `.` (source), so the value is
# written shell-quoted (%q). Plain tokens such as `completed`, `0` or `1.25`
# are written exactly as before; values containing spaces or shell
# metacharacters are escaped, so sourcing the file can neither run them as a
# command nor expand `$(...)` inside them.
#
# Globals:   SUMMARY_PATH (appended to)
# Arguments: $1 - key (must be a valid shell variable name)
#            $2 - value
set_summary() {
  printf '%s=%q\n' "$1" "$2" >> "$SUMMARY_PATH"
}

# Note a problem encountered while building a report section, as one
# "<section>: <message>" line.
# Globals:   SECTION_ERRORS_FILE (appended to)
# Arguments: $1 - section name
#            $2 - human-readable message
record_section_error() {
  local section="$1" message="$2"
  printf '%s: %s\n' "$section" "$message" >>"$SECTION_ERRORS_FILE"
}

# Succeed only when a *container* with the given name or ID exists.
#
# Plain `docker inspect NAME` resolves any object type (image, volume,
# network, ...), so a same-named image would make a missing container look
# present. `docker container inspect` restricts the lookup to containers.
#
# Arguments: $1 - container name or ID
# Returns:   0 if the container exists, non-zero otherwise (including an
#            empty name or an unreachable Docker daemon)
have_container() {
  [ -n "${1:-}" ] || return 1
  docker container inspect "$1" >/dev/null 2>&1
}

# Print the number of newline-terminated lines read from stdin as a plain
# integer (no padding, "0" for empty input).
count_lines() {
  local raw
  raw="$(wc -l)"
  printf '%d\n' "$(( raw + 0 ))"
}

# Filter: collapse every run of whitespace within a line to one space and cut
# each line to its first 260 characters (as counted by `cut -c`).
shorten() {
  local -r max_width=260
  sed -E -e 's/[[:space:]]+/ /g' | cut -c "1-${max_width}"
}

# Render a duration in seconds as a short German phrase using the two most
# significant units (days+hours, hours+minutes, minutes+seconds, or seconds).
#
# Arguments: $1 - non-negative integer number of seconds (default 0)
# Outputs:   the phrase on stdout, or "?" when $1 is not a plain digit string
format_duration() {
  local raw="${1:-0}"
  if [[ ! "$raw" =~ ^[0-9]+$ ]]; then
    printf '?\n'
    return
  fi
  # Force base 10: without the prefix, bash arithmetic reads a value with a
  # leading zero as octal ("0090" would be an error, "010" would mean 8).
  local s=$(( 10#$raw ))
  local d=$(( s / 86400 ))
  local h=$(( (s % 86400) / 3600 ))
  local m=$(( (s % 3600) / 60 ))
  local sec=$(( s % 60 ))
  if (( d > 0 )); then
    printf '%d Tage %d Stunden\n' "$d" "$h"
  elif (( h > 0 )); then
    printf '%d Stunden %d Minuten\n' "$h" "$m"
  elif (( m > 0 )); then
    printf '%d Minuten %d Sekunden\n' "$m" "$sec"
  else
    printf '%d Sekunden\n' "$sec"
  fi
}

# Write the "Betriebslage" section: container counts (total, running,
# unhealthy, exited with a non-zero code) and the state/age of the last
# posture-check result file.
#
# Globals: POSTURE_CHECK_FILE (read); report body, summary and section-error
#          files are written through append / set_summary /
#          record_section_error.
collect_overview() {
  local n_total n_running n_unhealthy n_exited_bad

  n_total="$(docker ps -a --format '{{.Names}}' | count_lines)"
  n_running="$(docker ps --format '{{.Names}}' | count_lines)"
  n_unhealthy="$(docker ps --filter health=unhealthy --format '{{.Names}}' | count_lines)"
  # Exited containers whose status text is anything other than "Exited (0)".
  n_exited_bad="$(docker ps -a --filter status=exited --format '{{.Names}} {{.Status}}' | awk '!/Exited \(0\)/ { count++ } END { print count + 0 }')"

  set_summary "containers_total" "$n_total"
  set_summary "containers_running" "$n_running"
  set_summary "containers_unhealthy" "$n_unhealthy"
  set_summary "containers_exited_nonzero" "$n_exited_bad"

  append "## Betriebslage"
  append ""
  append "- Container: $n_running/$n_total laufen"
  append "- Unhealthy Container: $n_unhealthy"
  append "- Exited non-zero Container: $n_exited_bad"

  # No result file: report that and stop.
  if [ ! -f "$POSTURE_CHECK_FILE" ]; then
    append "- Letzter Posture-Check: keine Datei gefunden"
    set_summary "posture_status" "missing"
    record_section_error "overview" "Posture-Check-Datei $POSTURE_CHECK_FILE fehlt"
    append ""
    return
  fi

  local state now mtime_epoch age_seconds
  # First "status": "<value>" match in the file, extracted without a JSON parser.
  state="$(sed -n 's/.*"status": *"\([^"]*\)".*/\1/p' "$POSTURE_CHECK_FILE" | head -n 1)"
  now="$(date +%s)"
  # If the mtime cannot be read, fall back to "now" so the age becomes 0.
  mtime_epoch="$(stat -c %Y "$POSTURE_CHECK_FILE" 2>/dev/null || echo "$now")"
  age_seconds=$(( now - mtime_epoch ))

  append "- Letzter Posture-Check: ${state:-unbekannt} (Datei ist $(format_duration "$age_seconds") alt)"
  set_summary "posture_status" "${state:-unknown}"
  set_summary "posture_age_seconds" "$age_seconds"
  append ""
}

# Write the "Host" section: hostname, boot time, uptime, load average and
# whether the host booted within the last 24 hours.
#
# When the boot time cannot be determined the uptime is reported as unknown
# and host_recent_boot is set to "unknown"; no recent reboot is claimed in
# that case. host_uptime_seconds stays numeric (0) for consumers that expect
# a number.
#
# Globals: report body, summary and section-error files are written through
#          append / set_summary / record_section_error.
collect_host_health() {
  append "## Host"
  append ""

  local boot_epoch boot_iso uptime_seconds now_epoch
  local load_1="" load_5="" load_15=""
  local uptime_known=1

  now_epoch="$(date +%s)"
  boot_epoch="$(awk '/^btime/ { print $2 }' /proc/stat 2>/dev/null || echo 0)"
  if [[ "${boot_epoch:-}" =~ ^[0-9]+$ ]] && [ "$boot_epoch" -gt 0 ]; then
    boot_iso="$(date -u -d "@$boot_epoch" -Iseconds 2>/dev/null || echo unknown)"
    uptime_seconds=$(( now_epoch - boot_epoch ))
    # A clock that jumped backwards would yield a negative uptime.
    if [ "$uptime_seconds" -lt 0 ]; then
      uptime_seconds=0
    fi
  else
    boot_iso="unknown"
    uptime_seconds=0
    uptime_known=0
    record_section_error "host" "/proc/stat btime nicht lesbar"
  fi

  if [ -r /proc/loadavg ]; then
    # "|| true": a failing read must not abort the whole report under set -e.
    read -r load_1 load_5 load_15 _ < /proc/loadavg || true
  fi
  load_1="${load_1:-?}"
  load_5="${load_5:-?}"
  load_15="${load_15:-?}"

  append "- Hostname: \`$(hostname)\`"
  append "- Boot-Zeit: \`$boot_iso\`"
  if [ "$uptime_known" -eq 1 ]; then
    append "- Uptime: $(format_duration "$uptime_seconds")"
  else
    append "- Uptime: unbekannt"
  fi
  append "- Load average (1/5/15): $load_1 / $load_5 / $load_15"

  if [ "$uptime_known" -eq 0 ]; then
    append "- WARNUNG: Boot-Zeit nicht ermittelbar, Reboot-Erkennung uebersprungen."
    set_summary "host_recent_boot" "unknown"
  elif [ "$uptime_seconds" -lt 86400 ]; then
    append "- WARNUNG: Boot innerhalb der letzten 24 Stunden erkannt."
    set_summary "host_recent_boot" "1"
  else
    append "- Reboot in den letzten 24h: nein"
    set_summary "host_recent_boot" "0"
  fi

  set_summary "host_uptime_seconds" "$uptime_seconds"
  set_summary "host_load_1" "$load_1"
  append ""
}

# Derive the overall verdict from the summary collected so far and store it
# in REPORT_STATUS as well as in the summary (key report_status).
#
# The summary file is sourced, so every key in it becomes a shell variable;
# keys that are absent fall back to the defaults used in the checks below.
#   KRITISCH - last Borg job failed/errored, an unhealthy container exists,
#              or at least one alert is firing
#   WARNUNG  - any warning indicator is set and nothing is critical
#   OK       - neither of the above
#
# Globals: SUMMARY_PATH (read, then appended to), REPORT_STATUS (written)
derive_report_status() {
  # shellcheck disable=SC1090
  source "$SUMMARY_PATH"

  local warn_seen=0 crit_seen=0
  local borg_state="${borg_status:-unknown}"
  local alerts_pending="${prometheus_alerts_pending:-0}"
  local alerts_firing="${prometheus_alerts_firing:-0}"

  # --- warning indicators ---
  if [ "$borg_state" != "completed" ] \
    || [ "${prometheus_alerts:-0}" = "unknown" ] \
    || [ "${cert_warnings:-0}" != "0" ] \
    || [ "${disk_warnings:-0}" != "0" ] \
    || [ "${image_warnings:-0}" != "0" ] \
    || [ "${containers_exited_nonzero:-0}" != "0" ] \
    || [ "${host_recent_boot:-0}" = "1" ] \
    || [ "${backup_duration_drift:-0}" = "1" ] \
    || [ "${noise_threshold_exceeded:-0}" != "0" ]; then
    warn_seen=1
  fi
  # Pending alerts warn only when the count is known and non-zero.
  case "$alerts_pending" in
    0 | unknown) ;;
    *) warn_seen=1 ;;
  esac

  # --- critical indicators ---
  case "$borg_state" in
    failed | error) crit_seen=1 ;;
  esac
  if [ "${containers_unhealthy:-0}" != "0" ]; then
    crit_seen=1
  fi
  # Firing alerts are critical only when the count is known and non-zero.
  case "$alerts_firing" in
    0 | unknown) ;;
    *) crit_seen=1 ;;
  esac

  if [ "$crit_seen" -eq 1 ]; then
    REPORT_STATUS="KRITISCH"
  elif [ "$warn_seen" -eq 1 ]; then
    REPORT_STATUS="WARNUNG"
  else
    REPORT_STATUS="OK"
  fi

  set_summary "report_status" "$REPORT_STATUS"
}

# Write the "Borg Backup" section from the SQLite database inside the Borg UI
# container: recent jobs, schedule, and a duration-drift check that compares
# the latest completed run with the 14-day median.
#
# The drift threshold is taken from BACKUP_DRIFT_FACTOR (default 2.0) and is
# handed to both embedded Python scripts through `docker exec -e`, so the
# rendered verdict and the summary flag use the same configured value.
#
# Globals: BORG_CONTAINER, BACKUP_DRIFT_FACTOR (read); BODY_PATH (appended);
#          summary keys borg_status and backup_duration_drift are written.
collect_borg() {
  append "## Borg Backup"
  append ""

  if ! have_container "$BORG_CONTAINER"; then
    append "- WARNUNG: Container \`$BORG_CONTAINER\` nicht gefunden."
    append ""
    set_summary "borg_status" "unknown"
    set_summary "backup_duration_drift" "unknown"
    record_section_error "borg" "Container $BORG_CONTAINER nicht gefunden"
    return
  fi

  # Only a plain non-negative decimal is forwarded; anything else falls back
  # to the historical default of 2.0.
  local drift_factor="${BACKUP_DRIFT_FACTOR:-2.0}"
  if [[ ! "$drift_factor" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
    record_section_error "borg" "BACKUP_DRIFT_FACTOR '$drift_factor' ungueltig - verwende 2.0"
    drift_factor="2.0"
  fi

  # First pass: human-readable section, streamed straight into the body.
  if ! docker exec -i -e "BACKUP_DRIFT_FACTOR=$drift_factor" "$BORG_CONTAINER" python3 - <<'PY' >> "$BODY_PATH"
import os
import sqlite3

# Drift threshold supplied by the host script; fall back to 2.0 when it is
# missing or unusable.
try:
    drift_factor = float(os.environ.get("BACKUP_DRIFT_FACTOR", "2.0"))
except ValueError:
    drift_factor = 2.0
if drift_factor <= 0:
    drift_factor = 2.0

def fmt_bytes(value):
    if value is None:
        return "-"
    value = float(value)
    units = ["B", "KB", "MB", "GB", "TB"]
    for unit in units:
        if value < 1024 or unit == units[-1]:
            return f"{value:.1f} {unit}" if unit != "B" else f"{int(value)} B"
        value /= 1024

def fmt_sec(s):
    s = int(s)
    h, rem = divmod(s, 3600)
    m, sec = divmod(rem, 60)
    if h > 0:
        return f"{h}h {m}m"
    return f"{m}m {sec}s"

conn = sqlite3.connect("/data/borg.db")
conn.row_factory = sqlite3.Row
cur = conn.cursor()

print("### Letzte Backup-Jobs")
rows = cur.execute("""
    select id, status, started_at, completed_at, archive_name, nfiles,
           original_size, compressed_size, deduplicated_size, error_message
      from backup_jobs
     where started_at >= datetime('now', '-30 hours')
        or created_at >= datetime('now', '-30 hours')
     order by coalesce(started_at, created_at) desc
     limit 8
""").fetchall()

if not rows:
    print("- WARNUNG: Kein Backup-Job in den letzten 30 Stunden gefunden.")
else:
    print("| Zeit UTC | Status | Archiv | Dateien | Original | Dedupliziert |")
    print("|---|---:|---|---:|---:|---:|")
    for row in rows:
        archive = row["archive_name"] or "-"
        if len(archive) > 54:
            archive = archive[:51] + "..."
        print(
            f"| {row['started_at'] or row['completed_at'] or '-'} "
            f"| {row['status']} "
            f"| {archive} "
            f"| {row['nfiles'] if row['nfiles'] is not None else '-'} "
            f"| {fmt_bytes(row['original_size'])} "
            f"| {fmt_bytes(row['deduplicated_size'])} |"
        )
        if row["error_message"]:
            print(f"  - Fehler: {row['error_message'][:240]}")

print("")
print("### Zeitplan")
for row in cur.execute("""
    select name, enabled, last_run, next_run, cron_expression
      from scheduled_jobs
     order by id
"""):
    enabled = "aktiv" if row["enabled"] else "pausiert"
    print(f"- {row['name']}: {enabled}, last={row['last_run'] or '-'}, next={row['next_run'] or '-'}, cron=`{row['cron_expression']}`")

print("")
print("### Dauer-Drift (Median 14 Tage)")
duration_rows = cur.execute("""
    select started_at, completed_at,
           (julianday(completed_at) - julianday(started_at)) * 86400 as duration_seconds
      from backup_jobs
     where status = 'completed'
       and started_at is not null
       and completed_at is not null
       and completed_at >= datetime('now', '-14 days')
     order by completed_at desc
""").fetchall()

durations = [r["duration_seconds"] for r in duration_rows if r["duration_seconds"] and r["duration_seconds"] > 0]

if len(durations) < 3:
    print(f"- Zu wenig Datenpunkte fuer eine Drift-Bewertung (n={len(durations)}).")
else:
    durations_sorted = sorted(durations)
    # Middle element of the sorted list (upper middle for an even count).
    median = durations_sorted[len(durations_sorted) // 2]
    # Rows are ordered newest first, so index 0 is the latest completed run.
    latest = durations[0]
    ratio = latest / median if median > 0 else 0
    print(f"- Letzter Lauf: {fmt_sec(latest)}")
    print(f"- Median 14 Tage: {fmt_sec(median)} (n={len(durations)})")
    print(f"- Verhaeltnis: {ratio:.2f}x")
    print(f"- Schwelle Warnung: Verhaeltnis ueber {drift_factor:.2f}x")
    if ratio > drift_factor:
        print(f"- Bewertung: Drift erkannt - letzter Lauf {ratio:.1f}x langsamer als der Median. Quellgroesse, IO und Repo-Zustand pruefen.")
    else:
        print("- Bewertung: Backup-Dauer im erwarteten Bereich.")
PY
  then
    append "- WARNUNG: Borg-Auswertung fehlgeschlagen."
    set_summary "borg_status" "unknown"
    set_summary "backup_duration_drift" "unknown"
    record_section_error "borg" "Python-Auswertung in $BORG_CONTAINER fehlgeschlagen"
  else
    # Second pass: machine-readable status/drift for the summary. Failures
    # are tolerated here and end up as "unknown" below.
    local borg_out borg_status borg_drift
    borg_out="$(docker exec -i -e "BACKUP_DRIFT_FACTOR=$drift_factor" "$BORG_CONTAINER" python3 - <<'PY' 2>/dev/null || true
import os
import sqlite3

try:
    drift_factor = float(os.environ.get("BACKUP_DRIFT_FACTOR", "2.0"))
except ValueError:
    drift_factor = 2.0
if drift_factor <= 0:
    drift_factor = 2.0

conn = sqlite3.connect("/data/borg.db")
conn.row_factory = sqlite3.Row
cur = conn.cursor()
status_row = cur.execute("""
    select status
      from backup_jobs
     order by coalesce(started_at, created_at) desc
     limit 1
""").fetchone()
status = status_row[0] if status_row else "missing"

duration_rows = cur.execute("""
    select (julianday(completed_at) - julianday(started_at)) * 86400 as ds
      from backup_jobs
     where status = 'completed'
       and started_at is not null
       and completed_at is not null
       and completed_at >= datetime('now', '-14 days')
     order by completed_at desc
""").fetchall()
durations = [r[0] for r in duration_rows if r[0] and r[0] > 0]
if len(durations) < 3:
    drift = "insufficient"
else:
    median = sorted(durations)[len(durations)//2]
    latest = durations[0]
    ratio = latest / median if median > 0 else 0
    drift = "1" if ratio > drift_factor else "0"

print(f"status={status}")
print(f"drift={drift}")
PY
)"
    borg_status="$(printf '%s' "$borg_out" | sed -n 's/^status=//p' | head -n 1)"
    borg_drift="$(printf '%s' "$borg_out" | sed -n 's/^drift=//p' | head -n 1)"
    # Anything other than an explicit 0/1 (e.g. "insufficient") is unknown.
    if [ "${borg_drift:-}" = "1" ]; then
      set_summary "backup_duration_drift" "1"
    elif [ "${borg_drift:-}" = "0" ]; then
      set_summary "backup_duration_drift" "0"
    else
      set_summary "backup_duration_drift" "unknown"
    fi
    set_summary "borg_status" "${borg_status:-unknown}"
  fi

  append ""
}

# Write the "Prometheus Alerts" section from the alerts API of the Prometheus
# container and record total / firing / pending counts in the summary.
#
# The JSON is inspected with grep (no JSON parser is assumed to be present).
# Every grep that may legitimately find nothing is guarded: with
# `set -o pipefail` an unmatched grep fails its pipeline, which would
# otherwise abort the whole report as soon as there are, for example, pending
# but no firing alerts.
#
# Globals: PROMETHEUS_CONTAINER (read); BODY_PATH (appended); summary keys
#          prometheus_alerts, prometheus_alerts_firing and
#          prometheus_alerts_pending are written (a number or "unknown").
collect_prometheus() {
  append "## Prometheus Alerts"
  append ""

  if ! have_container "$PROMETHEUS_CONTAINER"; then
    append "- WARNUNG: Container \`$PROMETHEUS_CONTAINER\` nicht gefunden."
    append ""
    set_summary "prometheus_alerts" "unknown"
    set_summary "prometheus_alerts_firing" "unknown"
    set_summary "prometheus_alerts_pending" "unknown"
    record_section_error "prometheus" "Container $PROMETHEUS_CONTAINER nicht gefunden"
    return
  fi

  local alerts
  alerts="$(docker exec "$PROMETHEUS_CONTAINER" wget -qO- http://localhost:9090/api/v1/alerts 2>/dev/null || true)"

  # Here-strings instead of `printf | grep -q`: grep -q exits on the first
  # match, which could SIGPIPE the writer and fail the pipeline on large
  # responses.
  if [ -z "$alerts" ]; then
    append "- WARNUNG: Prometheus Alerts API nicht erreichbar."
    set_summary "prometheus_alerts" "unknown"
    set_summary "prometheus_alerts_firing" "unknown"
    set_summary "prometheus_alerts_pending" "unknown"
    record_section_error "prometheus" "Alerts-API leer oder nicht erreichbar"
  elif ! grep -q '"status":"success"' <<<"$alerts"; then
    # An error payload must not be mistaken for "some alerts, all counts 0".
    append "- WARNUNG: Prometheus Alerts API lieferte keine erfolgreiche Antwort."
    set_summary "prometheus_alerts" "unknown"
    set_summary "prometheus_alerts_firing" "unknown"
    set_summary "prometheus_alerts_pending" "unknown"
    record_section_error "prometheus" "Alerts-API antwortete ohne status=success"
  elif grep -q '"alerts":\[\]' <<<"$alerts"; then
    append "- Keine aktiven Alerts."
    set_summary "prometheus_alerts" "0"
    set_summary "prometheus_alerts_firing" "0"
    set_summary "prometheus_alerts_pending" "0"
  else
    local total firing pending
    total="$({ grep -o '"alertname":"[^"]*"' <<<"$alerts" || true; } | count_lines)"
    firing="$({ grep -o '"state":"firing"' <<<"$alerts" || true; } | count_lines)"
    pending="$({ grep -o '"state":"pending"' <<<"$alerts" || true; } | count_lines)"
    append "- Aktive Alerts insgesamt: $total"
    append "- Davon firing: $firing"
    append "- Davon pending: $pending"
    append ""
    append "### Details"
    { grep -o '"alertname":"[^"]*"\|"severity":"[^"]*"\|"instance":"[^"]*"\|"service":"[^"]*"\|"state":"[^"]*"' <<<"$alerts" || true; } \
      | sed 's/^/  - /' >> "$BODY_PATH"
    set_summary "prometheus_alerts" "$total"
    set_summary "prometheus_alerts_firing" "$firing"
    set_summary "prometheus_alerts_pending" "$pending"
  fi
  append ""
}

# Write the "Zertifikate" section: decode the certificates stored in the
# Traefik ACME file (inside a throw-away python container), list the ones
# expiring soonest and count every certificate that has fewer than
# CERT_WARN_DAYS days left.
#
# Globals: TRAEFIK_ACME_PATH, CERT_MAX_ROWS, CERT_WARN_DAYS, TMP_DIR (read);
#          summary key cert_warnings is written.
collect_certificate_health() {
  append "## Zertifikate"
  append ""

  local raw_tsv="$TMP_DIR/certificates.tsv"
  local sorted_tsv="$TMP_DIR/certificates.sorted.tsv"
  local expiring=0
  local cert_total=0
  local parse_rc=0
  : > "$raw_tsv"

  if [ ! -f "$TRAEFIK_ACME_PATH" ]; then
    append "- WARNUNG: Traefik ACME-Datei nicht gefunden: $TRAEFIK_ACME_PATH"
    set_summary "cert_warnings" "1"
    record_section_error "certificates" "ACME-Datei $TRAEFIK_ACME_PATH fehlt"
    append ""
    return
  fi

  # Emits one "<days left>\t<expiry date>\t<domains>" line per certificate.
  docker run -i --rm \
    -v "$TRAEFIK_ACME_PATH:/acme.json:ro" \
    python:3.13-alpine python - <<'PY' > "$raw_tsv" || parse_rc=$?
import base64
import json
import ssl
import tempfile
from datetime import datetime, timezone

with open("/acme.json", "r", encoding="utf-8") as handle:
    data = json.load(handle)

now = datetime.now(timezone.utc)
for resolver in data.values():
    for cert in resolver.get("Certificates", []):
        domain = cert.get("domain", {}).get("main") or "-"
        sans = cert.get("domain", {}).get("sans") or []
        cert_b64 = cert.get("certificate")
        if not cert_b64:
            continue
        pem = base64.b64decode(cert_b64)
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            tmp.write(pem)
            tmp_path = tmp.name
        decoded = ssl._ssl._test_decode_cert(tmp_path)
        not_after = datetime.strptime(decoded["notAfter"], "%b %d %H:%M:%S %Y %Z").replace(tzinfo=timezone.utc)
        days = (not_after - now).days
        names = ", ".join([domain, *sans])
        print(f"{days}\t{not_after.date().isoformat()}\t{names}")
PY

  if [ "$parse_rc" -ne 0 ]; then
    append "- WARNUNG: Zertifikate konnten nicht aus ACME-Datei gelesen werden."
    expiring=1
    record_section_error "certificates" "Auswertung der ACME-Datei fehlgeschlagen"
  elif [ ! -s "$raw_tsv" ]; then
    append "- WARNUNG: Keine Zertifikate in ACME-Datei gefunden."
    expiring=1
    record_section_error "certificates" "ACME-Datei enthielt keine Zertifikate"
  else
    # Ascending by days left, so the most urgent certificates come first.
    sort -n "$raw_tsv" > "$sorted_tsv"
    cert_total="$(count_lines < "$sorted_tsv")"
    append "- Zertifikate gesamt: $cert_total"
    append "- Anzeige: die $CERT_MAX_ROWS Zertifikate mit der kuerzesten Restlaufzeit"
    append "- Schwelle Warnung: weniger als $CERT_WARN_DAYS Tage"
    append ""
    append "| Resttage | Ablaufdatum UTC | Domains |"
    append "|---:|---|---|"

    local days_left expiry_date domain_list
    # Table: only the first CERT_MAX_ROWS rows are shown ...
    while IFS=$'\t' read -r days_left expiry_date domain_list; do
      append "| $days_left | $expiry_date | $domain_list |"
    done < <(head -n "$CERT_MAX_ROWS" "$sorted_tsv")
    # ... but every certificate, shown or not, counts towards the warnings.
    while IFS=$'\t' read -r days_left _; do
      if [ "${days_left:-0}" -lt "$CERT_WARN_DAYS" ]; then
        expiring=$((expiring + 1))
      fi
    done < "$sorted_tsv"

    append ""
    if [ "$expiring" -eq 0 ]; then
      append "Bewertung: Keine Zertifikate im kritischen Erneuerungsfenster unter $CERT_WARN_DAYS Tagen."
    else
      append "Bewertung: $expiring Zertifikat(e) laufen in weniger als $CERT_WARN_DAYS Tagen ab und sollten beobachtet werden."
    fi
  fi

  set_summary "cert_warnings" "$expiring"
  append ""
}

# Write the "Storage / Filesystem" section: one table row per checked path
# with filesystem type, usage and free space, plus a warning count.
#
# A path counts as a warning when it is missing, when its usage cannot be
# determined, or when its usage is at or above DISK_USAGE_WARN_PCT.
#
# Globals: DISK_USAGE_WARN_PCT (read)
#          DISK_CHECK_PATHS (read, optional) - whitespace-separated list of
#            paths to check; defaults to the built-in list of core paths
#          summary key disk_warnings is written.
collect_disk_health() {
  append "## Storage / Filesystem"
  append ""

  local disk_warnings=0
  local path fstype usage usage_cell avail verdict
  local -a paths=()
  # Split into an array (no glob expansion, unlike an unquoted $var).
  read -r -a paths <<<"${DISK_CHECK_PATHS:-/mnt/cache /mnt/disk1 /mnt/user /mnt/user/appdata /mnt/user/backups}"

  append "- Schwelle Warnung: Nutzung ab ${DISK_USAGE_WARN_PCT}%"
  append ""
  append "| Pfad | Filesystem | Nutzung | Frei | Bewertung |"
  append "|---|---|---:|---:|---|"

  for path in "${paths[@]}"; do
    if [ ! -e "$path" ]; then
      append "| $path | - | - | - | fehlt |"
      disk_warnings=$((disk_warnings + 1))
      record_section_error "disk" "Kernpfad $path fehlt"
      continue
    fi

    # "|| true": with pipefail a failing df/findmnt would fail the whole
    # substitution and abort the report under set -e before the "unbekannt"
    # handling below gets a chance to run.
    fstype="$(findmnt -T "$path" -no FSTYPE 2>/dev/null | head -n 1 || true)"
    usage="$(df -P "$path" 2>/dev/null | awk 'NR==2 { gsub("%", "", $5); print $5 }' || true)"
    avail="$(df -hP "$path" 2>/dev/null | awk 'NR==2 { print $4 }' || true)"
    verdict="ok"

    if [[ ! "${usage:-}" =~ ^[0-9]+$ ]]; then
      usage_cell="-"
      verdict="unbekannt"
      disk_warnings=$((disk_warnings + 1))
    else
      usage_cell="${usage}%"
      if [ "$usage" -ge "$DISK_USAGE_WARN_PCT" ]; then
        verdict="Warnung: >=${DISK_USAGE_WARN_PCT}%"
        disk_warnings=$((disk_warnings + 1))
      fi
    fi

    append "| $path | ${fstype:-unbekannt} | $usage_cell | ${avail:-?} | $verdict |"
  done

  append ""
  if [ "$disk_warnings" -eq 0 ]; then
    append "Bewertung: Keine kritischen Fuellstaende oder fehlenden Kernpfade erkannt."
  else
    append "Bewertung: $disk_warnings Storage-/Filesystem-Punkt(e) brauchen Aufmerksamkeit."
  fi

  set_summary "disk_warnings" "$disk_warnings"
  append ""
}

# Write the "Image-Aktualitaet" section: for every running container,
# compute the age of its image in days, count the containers whose image is
# at least IMAGE_AGE_WARN_DAYS old, and list the ten oldest.
#
# Containers whose image ID or creation date cannot be determined are
# skipped silently.
#
# Globals: IMAGE_AGE_WARN_DAYS, TMP_DIR (read); summary key image_warnings
#          is written.
collect_image_freshness() {
  append "## Image-Aktualitaet"
  append ""

  local ages_tsv="$TMP_DIR/images.tsv"
  local stale_count=0
  local now
  : > "$ages_tsv"
  now="$(date +%s)"

  local container img_id built_at built_epoch days_old img_ref
  while IFS= read -r container; do
    if [ -z "$container" ]; then continue; fi
    img_id="$(docker inspect --format '{{.Image}}' "$container" 2>/dev/null || true)"
    if [ -z "$img_id" ]; then continue; fi
    built_at="$(docker image inspect --format '{{.Created}}' "$img_id" 2>/dev/null || true)"
    img_ref="$(docker inspect --format '{{.Config.Image}}' "$container" 2>/dev/null || echo '?')"
    if [ -z "$built_at" ]; then continue; fi
    # An unparsable timestamp becomes 0 and the container is skipped.
    built_epoch="$(date -d "$built_at" +%s 2>/dev/null || echo 0)"
    if ! [ "$built_epoch" -gt 0 ]; then continue; fi
    days_old=$(( (now - built_epoch) / 86400 ))
    printf '%d\t%s\t%s\n' "$days_old" "$container" "$img_ref" >> "$ages_tsv"
    if [ "$days_old" -ge "$IMAGE_AGE_WARN_DAYS" ]; then
      stale_count=$((stale_count + 1))
    fi
  done < <(docker ps --format '{{.Names}}')

  set_summary "image_warnings" "$stale_count"

  if [ ! -s "$ages_tsv" ]; then
    append "- Keine Image-Daten verfuegbar."
    record_section_error "images" "Keine Image-Daten ermittelt"
    append ""
    return
  fi

  append "- Schwelle Warnung: Image aelter als $IMAGE_AGE_WARN_DAYS Tage"
  append "- Container mit Image >= $IMAGE_AGE_WARN_DAYS Tage: $stale_count"
  append ""
  append "### Aelteste Images (Top 10)"
  append ""
  append "| Alter Tage | Container | Image |"
  append "|---:|---|---|"
  local row_age row_name row_image
  while IFS=$'\t' read -r row_age row_name row_image; do
    append "| $row_age | $row_name | $row_image |"
  done < <(sort -nr "$ages_tsv" | head -n 10)
  append ""
  if [ "$stale_count" -eq 0 ]; then
    append "Bewertung: Keine Container mit ueberalterten Images. CVE-Hygiene aus dieser Sicht ok."
  else
    append "Bewertung: $stale_count Container nutzen Images aelter als $IMAGE_AGE_WARN_DAYS Tage. Update-Pipeline und CVE-Status pruefen."
  fi
  append ""
}

# Write the "Docker Events" section: die/oom/kill/restart events inside the
# SINCE window, without "die" events that carry exit code 0.
#
# A failing or timed-out `docker events` call is no longer silent: whatever
# was received is still evaluated, but a warning line is added to the section
# and a section error is recorded, so an empty list is not mistaken for "no
# events". The raw-line tail honours MAX_LOG_LINES (default 80).
#
# Globals: SINCE, TMP_DIR, MAX_LOG_LINES (read); BODY_PATH (appended);
#          summary key docker_events is written.
collect_container_events() {
  append "## Docker Events ($SINCE)"
  append ""

  local raw_file="$TMP_DIR/docker-events.raw.log"
  local events_file="$TMP_DIR/docker-events.log"
  local events_rc=0

  # --until makes `docker events` return instead of streaming; `timeout`
  # (exit status 124) is the safety net should it hang anyway.
  timeout 20 docker events \
    --since "$SINCE" \
    --until "$(date -Iseconds)" \
    --filter event=die \
    --filter event=oom \
    --filter event=kill \
    --filter event=restart \
    --format '{{.Time}}|{{.Actor.Attributes.name}}|{{.Action}}|{{.Actor.Attributes.exitCode}}|{{.Actor.Attributes.image}}' \
    > "$raw_file" || events_rc=$?

  # Drop clean exits: action "die" with exit code 0.
  awk -F '|' '!(($3 == "die") && ($4 == "0")) { print }' "$raw_file" > "$events_file"

  local event_count
  event_count="$(count_lines < "$events_file")"
  set_summary "docker_events" "$event_count"

  if [ "$events_rc" -ne 0 ]; then
    append "- WARNUNG: Docker Events konnten nicht vollstaendig gelesen werden (Exit-Code $events_rc)."
    record_section_error "events" "docker events fehlgeschlagen (Exit-Code $events_rc)"
  fi

  if [ "$event_count" -eq 0 ]; then
    append '- Keine `die`/`oom`/`kill`/`restart` Events im Zeitraum.'
  else
    append "- Relevante Events: $event_count"
    append ""
    append '```text'
    tail -n "${MAX_LOG_LINES:-80}" "$events_file" >> "$BODY_PATH"
    append '```'
  fi
  append ""
}

# Write the "Container-Zustand" section with two lists: containers that are
# not running (status exited, dead or created) and containers whose restart
# counter is above zero, highest counter first.
#
# Globals: TMP_DIR (read); BODY_PATH (appended)
collect_container_state() {
  local not_running="$TMP_DIR/stopped.log"
  local restarted="$TMP_DIR/restarts.log"

  append "## Container-Zustand"
  append ""

  # --- containers that are not running ---
  append "### Nicht laufende Container"
  docker ps -a --filter status=exited --filter status=dead --filter status=created --format '{{.Names}}\t{{.Status}}' > "$not_running"
  if [ -s "$not_running" ]; then
    append '```text'
    cat "$not_running" >> "$BODY_PATH"
    append '```'
  else
    append "- Keine."
  fi
  append ""

  # --- containers that have been restarted ---
  append "### Container mit RestartCount > 0"
  : > "$restarted"
  local container restarts
  while IFS= read -r container; do
    if [ -z "$container" ]; then continue; fi
    # A container that cannot be inspected counts as 0 restarts.
    restarts="$(docker inspect "$container" --format '{{.RestartCount}}' 2>/dev/null || echo 0)"
    if [ "${restarts:-0}" -gt 0 ]; then
      printf '%s\t%s\n' "$container" "$restarts" >> "$restarted"
    fi
  done < <(docker ps -a --format '{{.Names}}')

  if [ -s "$restarted" ]; then
    append '```text'
    sort -k2,2nr "$restarted" >> "$BODY_PATH"
    append '```'
  else
    append "- Keine."
  fi
  append ""
}

# Write the "Traefik 5xx" section: log lines of the `traefik` container within
# the SINCE window whose 9th whitespace-separated field is a 5xx status code,
# grouped by the 12th field and the code, followed by the most recent raw
# lines.
# NOTE(review): using field 12 as the "service" depends on the access-log
# format configured for Traefik - confirm it really is the service/router.
#
# Globals: SINCE, MAX_LOG_LINES, TMP_DIR (read); BODY_PATH (appended);
#          summary key traefik_5xx is written (only when the container exists).
collect_traefik_5xx() {
  append "## Traefik 5xx ($SINCE)"
  append ""

  if ! have_container traefik; then
    append "- Traefik-Container nicht gefunden."
    append ""
    record_section_error "traefik" "Container traefik nicht gefunden"
    return
  fi

  local hits_file="$TMP_DIR/traefik-5xx.log"
  local hit_total

  # Best effort: a failing `docker logs` simply leaves the hit file empty.
  docker logs --since "$SINCE" traefik 2>&1 \
    | awk '$9 ~ /^5[0-9][0-9]$/ { print }' \
    > "$hits_file" || true

  hit_total="$(count_lines < "$hits_file")"
  set_summary "traefik_5xx" "$hit_total"

  if [ "$hit_total" -eq 0 ]; then
    append "- Keine 5xx-Antworten."
    append ""
    return
  fi

  append "- 5xx-Antworten: $hit_total"
  append ""
  append "### Gruppiert nach Service/Code"
  append '```text'
  awk '{ code=$9; service=$12; gsub(/"/, "", service); counts[service " " code]++ } END { for (k in counts) print counts[k], k }' "$hits_file" | sort -nr >> "$BODY_PATH"
  append '```'
  append ""
  append "### Letzte Zeilen"
  append '```text'
  tail -n "$MAX_LOG_LINES" "$hits_file" >> "$BODY_PATH"
  append '```'
  append ""
}

# Write the "Log-Auswertung" section.
#
# Steps, in order:
#   1. Scan the logs of every running container (window: SINCE) for
#      error-like keywords, drop lines matching a fixed list of benign
#      markers, redact token-like values and prefix each hit with
#      "[<container>] "            -> $hits
#   2. Normalize the known-noise pattern file       -> $noise_normalized
#   3. Split the hits into known noise and the rest -> $known_noise,
#      $attention
#   4. Count noise per container and per pattern; count the patterns whose
#      hit count exceeds NOISE_ESCALATION_THRESHOLD
#   5. Record the counters in the summary and render the section
#
# Globals: SINCE, TMP_DIR, NOISE_PATTERNS_FILE, NORMALIZE_NOISE_SCRIPT,
#          NOISE_ESCALATION_THRESHOLD, NOISE_BREAKDOWN_TOP_N, MAX_LOG_LINES,
#          SHOW_KNOWN_NOISE (read); BODY_PATH (appended); summary keys
#          noise_threshold_exceeded, log_highlights, log_hits_total and
#          log_known_noise are written.
collect_log_highlights() {
  append "## Log-Auswertung ($SINCE)"
  append ""
  append "Ziel dieses Abschnitts ist nicht, Rohlogs zu wiederholen, sondern handlungsrelevante Auffaelligkeiten auszusortieren."
  append ""

  # Working files: every keyword hit, the hits that need attention, and the
  # hits classified as known noise.
  local hits="$TMP_DIR/log-hits.log"
  local attention="$TMP_DIR/log-attention.log"
  local known_noise="$TMP_DIR/log-known-noise.log"
  : > "$hits"
  : > "$attention"
  : > "$known_noise"

  # Step 1: collect hits per running container. Stages of the pipeline:
  # keyword match (case-insensitive, binary treated as text), two exclusion
  # filters for benign lines, redaction of token/Authorization values, and
  # the "[name] " prefix. "|| true" because a grep stage that selects nothing
  # makes the pipeline fail under pipefail.
  while IFS= read -r name; do
    [ -n "$name" ] || continue
    docker logs --since "$SINCE" "$name" 2>&1 \
      | grep -Eai 'error|fatal|panic|exception|failed|denied|unauthorized|forbidden|oom' \
      | grep -Eavi 'level=info|levelname.: .INFO| 200 OK| 404 Not Found|healthcheck|probe_success' \
      | grep -Eavi 'production.DEBUG|stats_refresh_scheduler.*errors.: 0|Sync completed.*Failed: 0' \
      | sed -E 's/(refresh_token: )[A-Za-z0-9._-]+/\1[REDACTED]/Ig; s/(token: )[A-Za-z0-9._-]+/\1[REDACTED]/Ig; s/(Authorization: )[A-Za-z0-9._ -]+/\1[REDACTED]/Ig' \
      | sed "s/^/[$name] /" >> "$hits" || true
  done < <(docker ps --format '{{.Names}}')

  # Normalize the noise pattern file (drop comments, empty lines, trim
  # whitespace). An empty or whitespace-only pattern line would otherwise
  # make grep -Eaif match every hit and silently wipe the log highlights.
  local noise_normalized="$TMP_DIR/noise.patterns.normalized"
  : > "$noise_normalized"
  if [ -f "$NOISE_PATTERNS_FILE" ]; then
    if [ -x "$NORMALIZE_NOISE_SCRIPT" ]; then
      # A failing helper leaves the normalized list empty (no noise filter).
      "$NORMALIZE_NOISE_SCRIPT" "$NOISE_PATTERNS_FILE" > "$noise_normalized" 2>/dev/null || : > "$noise_normalized"
    else
      # NOTE(review): the message below says the patterns are used
      # un-normalized, but the inline fallback that follows does normalize
      # them - message and behavior disagree.
      record_section_error "log-highlights" "Normalize-Helper fehlt oder nicht ausfuehrbar: $NORMALIZE_NOISE_SCRIPT - Noise-Patterns ungenormt verwendet"
      # Fallback inline (same logic as the helper) so we still avoid the
      # "empty line matches all" trap.
      grep -Ev '^[[:space:]]*(#|$)' "$NOISE_PATTERNS_FILE" 2>/dev/null \
        | sed -E 's/^[[:space:]]+//; s/[[:space:]]+$//' \
        | grep -v '^$' > "$noise_normalized" || : > "$noise_normalized"
    fi
  else
    record_section_error "log-highlights" "Noise-Pattern-Datei $NOISE_PATTERNS_FILE fehlt - alle Treffer gelten als handlungsrelevant"
  fi

  # Step 3: classify. Known noise = hits matching any normalized pattern;
  # attention = hits that are not (whole-line, fixed-string) among the noise.
  if [ -s "$hits" ]; then
    if [ -s "$noise_normalized" ]; then
      grep -Eaif "$noise_normalized" "$hits" > "$known_noise" || true
    fi
    if [ -s "$known_noise" ]; then
      # Strip trailing whitespace on both sides so that lines differing only
      # in their whitespace ending still compare equal.
      sed -E 's/[[:space:]]+$//' "$known_noise" > "$known_noise.norm"
      sed -E 's/[[:space:]]+$//' "$hits" > "$hits.norm"
      grep -Fvxf "$known_noise.norm" "$hits.norm" > "$attention" || true
    else
      cp "$hits" "$attention"
    fi
  fi

  # Per-container noise breakdown (always computed, even if SHOW_KNOWN_NOISE=0).
  # Splitting on "[" and "]" makes $2 the container name from the line prefix.
  local noise_by_container="$TMP_DIR/noise-by-container.tsv"
  : > "$noise_by_container"
  if [ -s "$known_noise" ]; then
    awk -F '[][]' '{ counts[$2]++ } END { for (n in counts) print counts[n] "\t" n }' "$known_noise" \
      | sort -nr > "$noise_by_container"
  fi

  # Per-pattern noise breakdown: count how often each pattern hit in $hits.
  # Note: a single hit line may match multiple patterns; counts can overlap.
  local noise_by_pattern="$TMP_DIR/noise-by-pattern.tsv"
  : > "$noise_by_pattern"
  if [ -s "$noise_normalized" ] && [ -s "$hits" ]; then
    while IFS= read -r p; do
      [ -n "$p" ] || continue
      local pcount
      # grep -c exits non-zero when the count is 0, hence "|| true"; an empty
      # result is treated as 0 below.
      pcount="$(grep -Eaic -- "$p" "$hits" 2>/dev/null || true)"
      if [ "${pcount:-0}" -gt 0 ]; then
        printf '%d\t%s\n' "$pcount" "$p" >> "$noise_by_pattern"
      fi
    done < "$noise_normalized"
    if [ -s "$noise_by_pattern" ]; then
      # Sort in place, highest count first.
      sort -nr -o "$noise_by_pattern" "$noise_by_pattern"
    fi
  fi

  # Threshold escalation: how many patterns produced more than the threshold?
  local noise_threshold_exceeded=0
  if [ -s "$noise_by_pattern" ]; then
    noise_threshold_exceeded="$(awk -v t="$NOISE_ESCALATION_THRESHOLD" '$1 > t { n++ } END { print n + 0 }' "$noise_by_pattern")"
  fi
  set_summary "noise_threshold_exceeded" "$noise_threshold_exceeded"

  # Step 5a: counters for the summary (log_highlights = hits needing attention).
  local hit_count attention_count known_noise_count
  hit_count="$(count_lines < "$hits")"
  attention_count="$(count_lines < "$attention")"
  known_noise_count="$(count_lines < "$known_noise")"
  set_summary "log_highlights" "$attention_count"
  set_summary "log_hits_total" "$hit_count"
  set_summary "log_known_noise" "$known_noise_count"

  # Step 5b: render the section.
  if [ "$hit_count" -eq 0 ]; then
    append "- Keine auffaelligen Logmuster gefunden."
  else
    append "- Gefundene Logmuster insgesamt: $hit_count"
    append "- Davon als bekanntes Rauschen eingeordnet: $known_noise_count"
    append "- Handlungsrelevante Logmuster: $attention_count"
    append "- Noise-Pattern-Quelle: \`$NOISE_PATTERNS_FILE\`"
    append "- Eskalations-Schwelle pro Pattern: $NOISE_ESCALATION_THRESHOLD"
    if [ "$noise_threshold_exceeded" -gt 0 ]; then
      append "- WARNUNG: $noise_threshold_exceeded Pattern ueberschreit(en) die Schwelle - bitte pruefen ob noch wirklich Noise."
    fi
    append ""

    if [ "$attention_count" -eq 0 ]; then
      append "Bewertung: Keine handlungsrelevanten Logmuster. Die Treffer bestehen aus bekannten, aktuell nicht kritischen Meldungen."
    else
      append "Bewertung: Es gibt Logmuster, die nicht automatisch als bekanntes Rauschen eingeordnet wurden. Diese sollten geprueft werden."
      append ""
      append "### Betroffene Container"
      append ""
      append "| Container | Anzahl |"
      append "|---|---:|"
      awk -F '[][]' '{ counts[$2]++ } END { for (name in counts) print "| " name " | " counts[name] " |" }' "$attention" | sort >> "$BODY_PATH"
      append ""
      append "### Beispiele"
      append ""
      append '```text'
      # At most three example lines per container, whitespace collapsed and
      # cut to 220 characters; the overall output is capped at MAX_LOG_LINES.
      awk -F '[][]' '
        {
          name=$2
          if (seen[name] < 3) {
            line=$0
            gsub(/[[:space:]]+/, " ", line)
            if (length(line) > 220) line=substr(line, 1, 217) "..."
            print line
            seen[name]++
          }
        }
      ' "$attention" | head -n "$MAX_LOG_LINES" >> "$BODY_PATH"
      append '```'
    fi

    # Noise rankings (top NOISE_BREAKDOWN_TOP_N per container / per pattern).
    if [ "$known_noise_count" -gt 0 ]; then
      append ""
      append "### Bekanntes Rauschen (Top)"
      append ""
      if [ -s "$noise_by_container" ]; then
        append "#### Container mit den meisten Noise-Treffern"
        append ""
        append "| Container | Anzahl |"
        append "|---|---:|"
        head -n "$NOISE_BREAKDOWN_TOP_N" "$noise_by_container" \
          | while IFS="$(printf '\t')" read -r cnt cname; do
              append "| ${cname:-?} | $cnt |"
            done
        append ""
      fi
      if [ -s "$noise_by_pattern" ]; then
        append "#### Pattern mit den meisten Treffern"
        append ""
        append "| Pattern | Anzahl |"
        append "|---|---:|"
        head -n "$NOISE_BREAKDOWN_TOP_N" "$noise_by_pattern" \
          | while IFS="$(printf '\t')" read -r cnt pat; do
              # Patterns longer than 80 characters are shortened to 77 + "...".
              local short="$pat"
              if [ "${#short}" -gt 80 ]; then
                short="${short:0:77}..."
              fi
              # Escape pipe characters that would break the markdown table.
              short="${short//|/\\|}"
              append "| \`$short\` | $cnt |"
            done
        append ""
      fi
      if [ "$noise_threshold_exceeded" -gt 0 ]; then
        append "Bewertung: $noise_threshold_exceeded Pattern ueberschreit(en) die Eskalations-Schwelle ($NOISE_ESCALATION_THRESHOLD). Bitte pruefen, ob die als Noise eingeordneten Meldungen noch fachlich Noise sind oder ob sich ein echter Vorfall darunter versteckt."
      else
        append "Bewertung: Kein Pattern ueberschreitet die Eskalations-Schwelle ($NOISE_ESCALATION_THRESHOLD)."
      fi
    fi

    # Optional: show the first 50 raw lines that were classified as noise.
    if [ "$known_noise_count" -gt 0 ] && [ "$SHOW_KNOWN_NOISE" = "1" ]; then
      append ""
      append "### Ausgeblendetes bekanntes Rauschen (Top 50 Zeilen)"
      append ""
      append '```text'
      head -n 50 "$known_noise" >> "$BODY_PATH"
      append '```'
    fi
  fi
  append ""
}

# Append the "Log-Volumen" section: counts the log lines each running
# container produced within $SINCE and renders the noisiest ones as a table.
# Globals:  SINCE, TMP_DIR, LOG_VOLUME_TOP_N (read)
# Outputs:  report text via append; summary key log_volume_total via
#           set_summary; a section error via record_section_error when the
#           total over all containers is 0.
collect_log_volume() {
  append "## Log-Volumen ($SINCE)"
  append ""

  local volume_file="$TMP_DIR/log-volume.tsv"
  : > "$volume_file"

  local name count
  while IFS= read -r name; do
    [ -n "$name" ] || continue
    # A container can disappear between `docker ps` and `docker logs`, or use
    # a logging driver that cannot be read back. With errexit + pipefail that
    # single failure would abort the whole report, so it is counted as 0.
    if ! count="$(docker logs --since "$SINCE" "$name" 2>&1 | count_lines)"; then
      count=0
    fi
    # Strip padding and reject anything that is not a plain non-negative
    # integer so the TSV always carries a sortable number in column 1.
    count="${count//[[:space:]]/}"
    case "$count" in
      '' | *[!0-9]*) count=0 ;;
    esac
    printf '%s\t%s\n' "$count" "$name" >> "$volume_file"
  done < <(docker ps --format '{{.Names}}')

  local total
  total="$(awk '{ s += $1 } END { print s + 0 }' "$volume_file")"
  set_summary "log_volume_total" "$total"

  if [ "$total" -eq 0 ]; then
    append "- Keine Logzeilen im Zeitraum (unwahrscheinlich, evtl. Datenquelle pruefen)."
    record_section_error "log-volume" "Log-Volumen ueber alle Container ist 0"
  else
    append "- Zeilen insgesamt im Zeitraum: $total"
    append ""
    append "### Top $LOG_VOLUME_TOP_N lauteste Container"
    append ""
    append "| Container | Zeilen |"
    append "|---|---:|"
    # Process substitution instead of a pipeline: `head` closing early must
    # not surface as a pipefail error, and the loop runs in the current shell.
    local line_count container
    while IFS=$'\t' read -r line_count container; do
      append "| $container | $line_count |"
    done < <(sort -nr "$volume_file" | head -n "$LOG_VOLUME_TOP_N")
    append ""
    append "Bewertung: Auffaellig laute Container sind oft ein Frueh-Indikator fuer Endlosschleifen, schlecht konfigurierte Loglevel oder Probe-Spam."
  fi
  append ""
}

# Append the "Vergleich mit gestern" section: a table of today's summary
# values next to those persisted for the day before REPORT_DATE, followed by
# a one-line assessment of whether anything notable changed.
# Globals:  REPORT_DATE, REPORT_DIR, SUMMARY_PATH (read). Sourcing
#           SUMMARY_PATH defines today's summary keys as shell variables.
# Outputs:  report text via append.
collect_diff_yesterday() {
  append "## Vergleich mit gestern"
  append ""

  local yesterday yesterday_summary
  # Anchor "yesterday" on REPORT_DATE so a report generated for an overridden
  # date is compared with the day before *that* date. Falls back to the wall
  # clock if REPORT_DATE cannot be parsed by date(1).
  yesterday="$(
    date -d "$REPORT_DATE 1 day ago" +%F 2>/dev/null \
      || date -d 'yesterday' +%F 2>/dev/null \
      || true
  )"
  yesterday_summary="$REPORT_DIR/summary-$yesterday.env"

  if [ -z "$yesterday" ] || [ ! -f "$yesterday_summary" ]; then
    append "- Keine Vortagsdaten verfuegbar ($yesterday_summary)."
    append ""
    return
  fi

  local prev_borg= prev_alerts= prev_firing= prev_pending= prev_unhealthy= prev_exited= prev_5xx= prev_events= prev_log= prev_certs= prev_disk= prev_img= prev_drift= prev_vol=
  local key value
  # Yesterday's file is parsed (not sourced) so its values cannot clobber
  # today's variables. The `|| [ -n "$key" ]` keeps a final line that lacks a
  # trailing newline.
  while IFS='=' read -r key value || [ -n "$key" ]; do
    case "$key" in
      borg_status) prev_borg="$value" ;;
      prometheus_alerts) prev_alerts="$value" ;;
      prometheus_alerts_firing) prev_firing="$value" ;;
      prometheus_alerts_pending) prev_pending="$value" ;;
      containers_unhealthy) prev_unhealthy="$value" ;;
      containers_exited_nonzero) prev_exited="$value" ;;
      traefik_5xx) prev_5xx="$value" ;;
      docker_events) prev_events="$value" ;;
      log_highlights) prev_log="$value" ;;
      cert_warnings) prev_certs="$value" ;;
      disk_warnings) prev_disk="$value" ;;
      image_warnings) prev_img="$value" ;;
      backup_duration_drift) prev_drift="$value" ;;
      log_volume_total) prev_vol="$value" ;;
    esac
  done < "$yesterday_summary"

  # Today's values come from the in-progress summary file.
  # shellcheck disable=SC1090
  . "$SUMMARY_PATH"

  append "Vergleich des Datums $REPORT_DATE mit $yesterday."
  append ""
  append "| Metrik | Heute | Gestern |"
  append "|---|---:|---:|"
  append "| Borg Status | ${borg_status:-?} | ${prev_borg:-?} |"
  append "| Prometheus Alerts gesamt | ${prometheus_alerts:-?} | ${prev_alerts:-?} |"
  append "| Prometheus firing | ${prometheus_alerts_firing:-?} | ${prev_firing:-?} |"
  append "| Prometheus pending | ${prometheus_alerts_pending:-?} | ${prev_pending:-?} |"
  append "| Container unhealthy | ${containers_unhealthy:-?} | ${prev_unhealthy:-?} |"
  append "| Container exited non-zero | ${containers_exited_nonzero:-?} | ${prev_exited:-?} |"
  append "| Docker Events | ${docker_events:-?} | ${prev_events:-?} |"
  append "| Traefik 5xx | ${traefik_5xx:-?} | ${prev_5xx:-?} |"
  append "| Log-Highlights | ${log_highlights:-?} | ${prev_log:-?} |"
  append "| Log-Volumen | ${log_volume_total:-?} | ${prev_vol:-?} |"
  append "| Zertifikatswarnungen | ${cert_warnings:-?} | ${prev_certs:-?} |"
  append "| Storage-Warnungen | ${disk_warnings:-?} | ${prev_disk:-?} |"
  append "| Image-Warnungen | ${image_warnings:-?} | ${prev_img:-?} |"
  append "| Backup-Dauer-Drift | ${backup_duration_drift:-?} | ${prev_drift:-?} |"
  append ""

  # Only this subset of metrics decides whether the day counts as "changed";
  # a missing value on either side is treated as 0 / unknown.
  local notable=0
  if [ "${containers_exited_nonzero:-0}" != "${prev_exited:-0}" ] || \
     [ "${containers_unhealthy:-0}" != "${prev_unhealthy:-0}" ] || \
     [ "${prometheus_alerts_firing:-0}" != "${prev_firing:-0}" ] || \
     [ "${prometheus_alerts_pending:-0}" != "${prev_pending:-0}" ] || \
     [ "${log_highlights:-0}" != "${prev_log:-0}" ] || \
     [ "${borg_status:-unknown}" != "${prev_borg:-unknown}" ] || \
     [ "${backup_duration_drift:-0}" != "${prev_drift:-0}" ]; then
    notable=1
  fi

  if [ "$notable" -eq 0 ]; then
    append "Bewertung: Keine relevanten Aenderungen gegenueber gestern."
  else
    append "Bewertung: Relevante Aenderungen gegenueber gestern. Details bitte in den einzelnen Abschnitten pruefen."
  fi
  append ""
}

# Append the "Self-Health" section describing this script run itself:
# elapsed runtime, number of recorded section errors, presence of the noise
# pattern file and the lock file path. Runtime and error count are also
# stored in the summary via set_summary.
# Globals:  SCRIPT_START, SECTION_ERRORS_FILE, NOISE_PATTERNS_FILE,
#           LOCK_FILE (read)
collect_self_health() {
  append "## Self-Health"
  append ""

  local elapsed_s failed_sections patterns_present error_entry
  elapsed_s=$(( $(date +%s) - SCRIPT_START ))
  failed_sections="$(count_lines < "$SECTION_ERRORS_FILE")"

  set_summary "script_duration_seconds" "$elapsed_s"
  set_summary "section_failures" "$failed_sections"

  if [ -f "$NOISE_PATTERNS_FILE" ]; then
    patterns_present="ja"
  else
    patterns_present="nein"
  fi

  append "- Skript-Laufzeit: $(format_duration "$elapsed_s") (${elapsed_s}s)"
  append "- Sektionen mit Fehlern: $failed_sections"
  append "- Noise-Pattern-Datei vorhanden: $patterns_present"
  append "- Lock-Datei: \`$LOCK_FILE\`"

  # List every recorded error verbatim, one bullet per line of the file.
  if [ "$failed_sections" -gt 0 ]; then
    append ""
    append "### Fehlerhafte Sektionen"
    append ""
    while IFS= read -r error_entry; do
      append "- $error_entry"
    done < "$SECTION_ERRORS_FILE"
  fi
  append ""
}

# Assemble the final markdown report (header, executive summary, management
# bullet list, collected body, closing assessment) and persist today's
# summary next to it. Both files are written to a ".tmp" sibling first and
# then moved into place.
# Globals:  REPORT_DIR, REPORT_DATE, REPORT_PATH, REPORT_STATUS, SINCE,
#           SUMMARY_PATH, BODY_PATH, PERSISTENT_SUMMARY_PATH (read).
#           Sourcing SUMMARY_PATH defines the summary keys as shell variables.
write_report() {
  mkdir -p "$REPORT_DIR"

  # shellcheck disable=SC1090
  . "$SUMMARY_PATH"

  # Pick the status-dependent opening and closing paragraphs once; anything
  # other than OK / WARNUNG is reported with the critical wording.
  local _wr_intro _wr_closing
  case "$REPORT_STATUS" in
    OK)
      _wr_intro='Im betrachteten Zeitraum zeigt das Homelab eine stabile Betriebslage. Das letzte Borg-Backup ist erfolgreich abgeschlossen, Prometheus meldet keine firing Alerts, keine unhealthy Container, Zertifikate und Storage im erwarteten Bereich.'
      _wr_closing='Das Homelab war im betrachteten Zeitraum betriebsfaehig und ohne akute Warnsignale. Es besteht aus diesem Report heraus kein unmittelbarer Handlungsdruck.'
      ;;
    WARNUNG)
      _wr_intro='Im betrachteten Zeitraum gibt es Punkte, die Aufmerksamkeit verdienen. Der Betrieb ist nicht automatisch als kompromittiert zu bewerten, aber mindestens ein Signal (Backup, Pending Alert, Zertifikat, Storage, Image-Alter, Drift oder Reboot) weicht vom Normalzustand ab.'
      _wr_closing='Das Homelab war grundsaetzlich betriebsfaehig, zeigt aber mindestens eine Auffaelligkeit. Die im Bericht genannten Punkte sollten geprueft und bei Wiederholung nachverfolgt werden.'
      ;;
    *)
      _wr_intro='Im betrachteten Zeitraum liegt ein kritisches Betriebssignal vor. Der Bericht sollte zeitnah gelesen und die betroffenen Komponenten priorisiert geprueft werden.'
      _wr_closing='Das Homelab zeigt ein kritisches Signal. Die betroffenen Dienste, Backup-Lage und firing Alerts sollten sofort geprueft werden.'
      ;;
  esac

  local _wr_report_tmp="$REPORT_PATH.tmp"
  local _wr_summary_tmp="$PERSISTENT_SUMMARY_PATH.tmp"

  {
    printf '# Homelab Operations Report - %s\n\n' "$REPORT_DATE"
    printf -- '- %s: `%s`\n' 'Erstellt' "$(date -Iseconds)"
    printf -- '- Zeitraum: letzte `%s`\n' "$SINCE"
    printf -- '- %s: `%s`\n' 'Host' "$(hostname)"
    printf -- '- %s: `%s`\n\n' 'Gesamtbewertung' "$REPORT_STATUS"

    printf '## Executive Summary\n\n'
    printf '%s\n\n' "$_wr_intro"

    printf '### Management-Bewertung\n\n'
    # printf re-applies the format to each label/value pair.
    printf -- '- %s: `%s`\n' \
      'Status' "$REPORT_STATUS" \
      'Borg Backup' "${borg_status:-unknown}" \
      'Backup-Dauer-Drift' "${backup_duration_drift:-unknown}"
    printf -- '- Prometheus Alerts (gesamt/firing/pending): `%s` / `%s` / `%s`\n' \
      "${prometheus_alerts:-unknown}" \
      "${prometheus_alerts_firing:-unknown}" \
      "${prometheus_alerts_pending:-unknown}"
    printf -- '- %s: `%s`\n' \
      'Container unhealthy' "${containers_unhealthy:-unknown}" \
      'Container exited non-zero' "${containers_exited_nonzero:-unknown}" \
      'Docker Critical Events' "${docker_events:-unknown}" \
      'Traefik 5xx' "${traefik_5xx:-unknown}" \
      'Zertifikatswarnungen' "${cert_warnings:-unknown}" \
      'Storage-Warnungen' "${disk_warnings:-unknown}" \
      'Image-Warnungen' "${image_warnings:-unknown}" \
      'Log-Highlights' "${log_highlights:-unknown}" \
      'Noise-Pattern ueber Schwelle' "${noise_threshold_exceeded:-0}" \
      'Log-Volumen gesamt' "${log_volume_total:-unknown}" \
      'Reboot in letzten 24h' "${host_recent_boot:-unknown}" \
      'Sektionsfehler im Skript' "${section_failures:-unknown}"
    printf '\n'

    printf '### Einordnung\n\n'
    printf 'Dieser Report ist ein Management-Lagebericht: Er verdichtet Backup-Status, Container-Zustand, Monitoring-Alerts, Traefik-Fehler, Zertifikate, Storage, Image-Aktualitaet, Log-Volumen und Drift-Indikatoren. Rohlogs werden nur ausschnittsweise gezeigt, damit der Bericht lesbar bleibt und trotzdem nachvollziehbar ist.\n\n'

    cat "$BODY_PATH"

    printf '## Schlussbewertung\n\n'
    printf '%s\n' "$_wr_closing"
  } > "$_wr_report_tmp"
  mv "$_wr_report_tmp" "$REPORT_PATH"

  # Keep a dated copy of the summary for later runs to read.
  cp "$SUMMARY_PATH" "$_wr_summary_tmp"
  mv "$_wr_summary_tmp" "$PERSISTENT_SUMMARY_PATH"
}

# Hand the finished report to the mail script when SEND_MAIL=1 and the
# combination of MAIL_MODE and REPORT_STATUS calls for a mail.
# Globals:  SEND_MAIL, MAIL_SCRIPT, MAIL_MODE, REPORT_STATUS, REPORT_PATH
# Returns:  0 when mailing is disabled or not triggered; the mail script's
#           status when it is run; 1 when the mail script is missing or not
#           executable, or MAIL_MODE is not always|warning|critical.
send_report_mail() {
  if [ "$SEND_MAIL" != "1" ]; then
    return 0
  fi

  if [ ! -x "$MAIL_SCRIPT" ]; then
    echo "Mail script missing or not executable: $MAIL_SCRIPT" >&2
    record_section_error "mail" "Mail-Skript $MAIL_SCRIPT fehlt oder nicht ausfuehrbar"
    return 1
  fi

  local mode_and_status="$MAIL_MODE:$REPORT_STATUS"
  case "$mode_and_status" in
    always:* | warning:WARNUNG | warning:KRITISCH | critical:KRITISCH)
      "$MAIL_SCRIPT" "$REPORT_PATH" "$REPORT_STATUS"
      ;;
    warning:* | critical:*)
      # Known mode, but the current status does not trigger a mail.
      :
      ;;
    *)
      echo "Unknown MAIL_MODE '$MAIL_MODE' - mail not sent. Use always|warning|critical." >&2
      record_section_error "mail" "Unbekanntes MAIL_MODE '$MAIL_MODE'"
      return 1
      ;;
  esac
}

# Push a compact text summary of the run through the ntfy helper script.
# Best effort: does nothing unless SEND_NTFY=1 and NTFY_SCRIPT is executable,
# and a failing helper never fails this function.
# Globals:  SEND_NTFY, NTFY_SCRIPT, NTFY_TOPIC, SUMMARY_PATH, REPORT_PATH,
#           REPORT_STATUS (read). Sourcing SUMMARY_PATH defines the summary
#           keys as shell variables.
send_summary_ntfy() {
  if [ "$SEND_NTFY" != "1" ] || [ ! -x "$NTFY_SCRIPT" ]; then
    return 0
  fi

  # shellcheck disable=SC1090
  . "$SUMMARY_PATH"

  # The notification priority follows the overall report status.
  local ntfy_priority
  if [ "$REPORT_STATUS" = "KRITISCH" ]; then
    ntfy_priority="urgent"
  elif [ "$REPORT_STATUS" = "WARNUNG" ]; then
    ntfy_priority="high"
  else
    ntfy_priority="default"
  fi

  local ntfy_title="Homelab Tagesprotokoll: ${REPORT_STATUS:-unknown} / borg=${borg_status:-unknown}"

  # One argument per message line; printf appends a newline to each, the
  # trailing one is stripped again so the message ends without a newline.
  local ntfy_message
  printf -v ntfy_message '%s\n' \
    "Report: $REPORT_PATH" \
    "Status: $REPORT_STATUS" \
    "Container: ${containers_running:-?}/${containers_total:-?} running, unhealthy=${containers_unhealthy:-?}, exited_nonzero=${containers_exited_nonzero:-?}" \
    "Borg: ${borg_status:-unknown} (drift=${backup_duration_drift:-unknown})" \
    "Prometheus alerts (total/firing/pending): ${prometheus_alerts:-unknown}/${prometheus_alerts_firing:-unknown}/${prometheus_alerts_pending:-unknown}" \
    "Docker events: ${docker_events:-unknown}" \
    "Traefik 5xx: ${traefik_5xx:-unknown}" \
    "Certs warn: ${cert_warnings:-unknown}" \
    "Disk warn: ${disk_warnings:-unknown}" \
    "Image warn: ${image_warnings:-unknown}" \
    "Log highlights: ${log_highlights:-unknown}" \
    "Log volume: ${log_volume_total:-unknown}" \
    "Recent boot: ${host_recent_boot:-unknown}" \
    "Section errors: ${section_failures:-unknown}"
  ntfy_message="${ntfy_message%$'\n'}"

  "$NTFY_SCRIPT" "$NTFY_TOPIC" "$ntfy_title" "$ntfy_message" "$ntfy_priority" || true
}

# Run every report stage in its fixed order, then print the path of the
# written report on stdout. The order matters: the collectors come first,
# the status is derived before the self-health section and the report are
# written, and the notifications go out last.
main() {
  local -a _main_steps=(
    collect_overview
    collect_host_health
    collect_borg
    collect_prometheus
    collect_certificate_health
    collect_disk_health
    collect_image_freshness
    collect_container_events
    collect_container_state
    collect_traefik_5xx
    collect_log_highlights
    collect_log_volume
    collect_diff_yesterday
    derive_report_status
    collect_self_health
    write_report
    send_report_mail
    send_summary_ntfy
  )

  local _main_step
  for _main_step in "${_main_steps[@]}"; do
    "$_main_step"
  done

  printf '%s\n' "$REPORT_PATH"
}

# Run the report, then translate the final REPORT_STATUS into the process
# exit code: 2 for KRITISCH, 1 for WARNUNG, 0 for everything else.
main "$@"

if [ "$REPORT_STATUS" = "KRITISCH" ]; then
  exit 2
elif [ "$REPORT_STATUS" = "WARNUNG" ]; then
  exit 1
fi
exit 0