#!/usr/bin/env bash
#
# Source path (taken from the patch header this script was delivered in):
#   services/posture-check/daily-status-report.sh
#
# Configuration for the daily status report. Every setting below can be
# overridden through an environment variable of the same name; the value after
# ":-" is the default used when the variable is unset or empty.
set -euo pipefail

# Epoch seconds at script start.
SCRIPT_START="$(date +%s)"

# --- Report output locations -------------------------------------------------
REPORT_DIR="${REPORT_DIR:-/mnt/user/services/posture-check/daily-reports}"
REPORT_DATE="${REPORT_DATE:-$(date +%F)}"
REPORT_PATH="${REPORT_PATH:-$REPORT_DIR/homelab-day-$REPORT_DATE.md}"
PERSISTENT_SUMMARY_PATH="${PERSISTENT_SUMMARY_PATH:-$REPORT_DIR/summary-$REPORT_DATE.env}"

# --- Evaluation window and thresholds ----------------------------------------
# SINCE is handed to `docker logs --since` / `docker events --since`.
SINCE="${SINCE:-24h}"
MAX_LOG_LINES="${MAX_LOG_LINES:-80}"
CERT_MAX_ROWS="${CERT_MAX_ROWS:-12}"
IMAGE_AGE_WARN_DAYS="${IMAGE_AGE_WARN_DAYS:-180}"
LOG_VOLUME_TOP_N="${LOG_VOLUME_TOP_N:-10}"
DISK_USAGE_WARN_PCT="${DISK_USAGE_WARN_PCT:-85}"
CERT_WARN_DAYS="${CERT_WARN_DAYS:-21}"
BACKUP_DRIFT_FACTOR="${BACKUP_DRIFT_FACTOR:-2.0}"
SHOW_KNOWN_NOISE="${SHOW_KNOWN_NOISE:-0}"

# --- Notification settings ---------------------------------------------------
SEND_MAIL="${SEND_MAIL:-0}"
MAIL_MODE="${MAIL_MODE:-always}"
MAIL_SCRIPT="${MAIL_SCRIPT:-/mnt/user/services/homelab-infra/services/posture-check/send-operations-report-mail.sh}"
SEND_NTFY="${SEND_NTFY:-0}"
NTFY_TOPIC="${NTFY_TOPIC:-homelab-info}"
NTFY_SCRIPT="${NTFY_SCRIPT:-/mnt/user/services/homelab-infra/ops/restore-tests/send-ntfy.sh}"

# --- Data sources ------------------------------------------------------------
BORG_CONTAINER="${BORG_CONTAINER:-borg-ui}"
PROMETHEUS_CONTAINER="${PROMETHEUS_CONTAINER:-monitoring-prometheus}"
TRAEFIK_ACME_PATH="${TRAEFIK_ACME_PATH:-/mnt/user/appdata/traefik/letsencrypt/acme.json}"
NOISE_PATTERNS_FILE="${NOISE_PATTERNS_FILE:-/mnt/user/services/homelab-infra/services/posture-check/log-noise.patterns}"
NORMALIZE_NOISE_SCRIPT="${NORMALIZE_NOISE_SCRIPT:-/mnt/user/services/homelab-infra/services/posture-check/lib/normalize-noise-patterns.sh}"
NOISE_ESCALATION_THRESHOLD="${NOISE_ESCALATION_THRESHOLD:-500}"
NOISE_BREAKDOWN_TOP_N="${NOISE_BREAKDOWN_TOP_N:-10}"
POSTURE_CHECK_FILE="${POSTURE_CHECK_FILE:-/mnt/user/services/posture-check/last.json}"

#######################################
# Check that the variable named by $1 holds a non-negative integer.
# Arguments: $1 - NAME of the variable to check (not its value)
# Outputs:   a diagnostic on stderr when the value is malformed
# Returns:   0 if valid, 1 otherwise
#######################################
validate_uint_setting() {
  local name="$1"
  local value="${!name-}"
  if ! [[ "$value" =~ ^[0-9]+$ ]]; then
    printf 'Invalid value for %s: "%s" (expected a non-negative integer)\n' \
      "$name" "$value" >&2
    return 1
  fi
}

# Fail fast on malformed numeric overrides. These values are later used in
# integer tests, `head -n`/`tail -n` and shell arithmetic, where a bad value
# would only surface as an obscure error in the middle of a report section.
for _setting in MAX_LOG_LINES CERT_MAX_ROWS IMAGE_AGE_WARN_DAYS \
  LOG_VOLUME_TOP_N DISK_USAGE_WARN_PCT CERT_WARN_DAYS \
  NOISE_ESCALATION_THRESHOLD NOISE_BREAKDOWN_TOP_N; do
  validate_uint_setting "$_setting" || exit 2
done
unset _setting

# The drift factor is a ratio and may be fractional (default 2.0).
if ! [[ "$BACKUP_DRIFT_FACTOR" =~ ^[0-9]+([.][0-9]+)?$ ]]; then
  printf 'Invalid value for BACKUP_DRIFT_FACTOR: "%s" (expected a non-negative number)\n' \
    "$BACKUP_DRIFT_FACTOR" >&2
  exit 2
fi
+LOCK_FILE="${LOCK_FILE:-/tmp/homelab-daily-report.lock}" +REPORT_STATUS="UNKNOWN" + +exec 9>"$LOCK_FILE" +if ! flock -n 9; then + echo "Another daily-status-report run is already in progress (lock: $LOCK_FILE)" >&2 + exit 3 +fi + +TMP_DIR="$(mktemp -d /tmp/homelab-daily-report.XXXXXX)" +BODY_PATH="$TMP_DIR/body.md" +SUMMARY_PATH="$TMP_DIR/summary.env" +SECTION_ERRORS_FILE="$TMP_DIR/section-errors.log" +: > "$BODY_PATH" +: > "$SUMMARY_PATH" +: > "$SECTION_ERRORS_FILE" + +cleanup() { + rm -rf "$TMP_DIR" +} +trap cleanup EXIT + +append() { + printf '%s\n' "$*" >> "$BODY_PATH" +} + +append_block() { + cat >> "$BODY_PATH" +} + +set_summary() { + printf '%s=%s\n' "$1" "$2" >> "$SUMMARY_PATH" +} + +record_section_error() { + printf '%s: %s\n' "$1" "$2" >> "$SECTION_ERRORS_FILE" +} + +have_container() { + docker inspect "$1" >/dev/null 2>&1 +} + +count_lines() { + wc -l | awk '{ print $1 + 0 }' +} + +shorten() { + sed -E 's/[[:space:]]+/ /g' | cut -c 1-260 +} + +format_duration() { + local s="${1:-0}" + if ! 
printf '%s' "$s" | grep -Eq '^[0-9]+$'; then + printf '?\n' + return + fi + local d=$(( s / 86400 )) + local h=$(( (s % 86400) / 3600 )) + local m=$(( (s % 3600) / 60 )) + local sec=$(( s % 60 )) + if [ "$d" -gt 0 ]; then + printf '%d Tage %d Stunden\n' "$d" "$h" + elif [ "$h" -gt 0 ]; then + printf '%d Stunden %d Minuten\n' "$h" "$m" + elif [ "$m" -gt 0 ]; then + printf '%d Minuten %d Sekunden\n' "$m" "$sec" + else + printf '%d Sekunden\n' "$sec" + fi +} + +collect_overview() { + local running total unhealthy exited_nonzero + + total="$(docker ps -a --format '{{.Names}}' | count_lines)" + running="$(docker ps --format '{{.Names}}' | count_lines)" + unhealthy="$(docker ps --filter health=unhealthy --format '{{.Names}}' | count_lines)" + exited_nonzero="$(docker ps -a --filter status=exited --format '{{.Names}} {{.Status}}' | awk '!/Exited \(0\)/ { count++ } END { print count + 0 }')" + + set_summary "containers_total" "$total" + set_summary "containers_running" "$running" + set_summary "containers_unhealthy" "$unhealthy" + set_summary "containers_exited_nonzero" "$exited_nonzero" + + append "## Betriebslage" + append "" + append "- Container: $running/$total laufen" + append "- Unhealthy Container: $unhealthy" + append "- Exited non-zero Container: $exited_nonzero" + + if [ -f "$POSTURE_CHECK_FILE" ]; then + local posture_status posture_age now_epoch + posture_status="$(sed -n 's/.*"status": *"\([^"]*\)".*/\1/p' "$POSTURE_CHECK_FILE" | head -n 1)" + now_epoch="$(date +%s)" + posture_age=$(( now_epoch - $(stat -c %Y "$POSTURE_CHECK_FILE" 2>/dev/null || echo "$now_epoch") )) + append "- Letzter Posture-Check: ${posture_status:-unbekannt} (Datei ist $(format_duration "$posture_age") alt)" + set_summary "posture_status" "${posture_status:-unknown}" + set_summary "posture_age_seconds" "$posture_age" + else + append "- Letzter Posture-Check: keine Datei gefunden" + set_summary "posture_status" "missing" + record_section_error "overview" "Posture-Check-Datei 
$POSTURE_CHECK_FILE fehlt" + fi + append "" +} + +collect_host_health() { + append "## Host" + append "" + + local boot_epoch boot_iso uptime_seconds load_1 load_5 load_15 now_epoch + + now_epoch="$(date +%s)" + boot_epoch="$(awk '/^btime/ { print $2 }' /proc/stat 2>/dev/null || echo 0)" + if [ "${boot_epoch:-0}" -gt 0 ]; then + boot_iso="$(date -u -d "@$boot_epoch" -Iseconds 2>/dev/null || echo unknown)" + uptime_seconds=$(( now_epoch - boot_epoch )) + else + boot_iso="unknown" + uptime_seconds=0 + record_section_error "host" "/proc/stat btime nicht lesbar" + fi + if [ "$uptime_seconds" -lt 0 ]; then + uptime_seconds=0 + fi + + if [ -r /proc/loadavg ]; then + read -r load_1 load_5 load_15 _ < /proc/loadavg + else + load_1="?"; load_5="?"; load_15="?" + fi + + append "- Hostname: \`$(hostname)\`" + append "- Boot-Zeit: \`$boot_iso\`" + append "- Uptime: $(format_duration "$uptime_seconds")" + append "- Load average (1/5/15): $load_1 / $load_5 / $load_15" + if [ "$uptime_seconds" -lt 86400 ]; then + append "- WARNUNG: Boot innerhalb der letzten 24 Stunden erkannt." + set_summary "host_recent_boot" "1" + else + append "- Reboot in den letzten 24h: nein" + set_summary "host_recent_boot" "0" + fi + + set_summary "host_uptime_seconds" "$uptime_seconds" + set_summary "host_load_1" "$load_1" + append "" +} + +derive_report_status() { + # shellcheck disable=SC1090 + . 
"$SUMMARY_PATH" + + REPORT_STATUS="OK" + local has_warn=0 has_crit=0 + + [ "${borg_status:-unknown}" != "completed" ] && has_warn=1 + [ "${prometheus_alerts:-0}" = "unknown" ] && has_warn=1 + [ "${cert_warnings:-0}" != "0" ] && has_warn=1 + [ "${disk_warnings:-0}" != "0" ] && has_warn=1 + [ "${image_warnings:-0}" != "0" ] && has_warn=1 + [ "${containers_exited_nonzero:-0}" != "0" ] && has_warn=1 + [ "${host_recent_boot:-0}" = "1" ] && has_warn=1 + [ "${backup_duration_drift:-0}" = "1" ] && has_warn=1 + [ "${noise_threshold_exceeded:-0}" != "0" ] && has_warn=1 + if [ "${prometheus_alerts_pending:-0}" != "0" ] && [ "${prometheus_alerts_pending:-0}" != "unknown" ]; then + has_warn=1 + fi + + [ "${borg_status:-unknown}" = "failed" ] && has_crit=1 + [ "${borg_status:-unknown}" = "error" ] && has_crit=1 + [ "${containers_unhealthy:-0}" != "0" ] && has_crit=1 + if [ "${prometheus_alerts_firing:-0}" != "0" ] && [ "${prometheus_alerts_firing:-0}" != "unknown" ]; then + has_crit=1 + fi + + if [ "$has_crit" -eq 1 ]; then + REPORT_STATUS="KRITISCH" + elif [ "$has_warn" -eq 1 ]; then + REPORT_STATUS="WARNUNG" + fi + + set_summary "report_status" "$REPORT_STATUS" +} + +collect_borg() { + append "## Borg Backup" + append "" + + if ! have_container "$BORG_CONTAINER"; then + append "- WARNUNG: Container \`$BORG_CONTAINER\` nicht gefunden." + append "" + set_summary "borg_status" "unknown" + set_summary "backup_duration_drift" "unknown" + record_section_error "borg" "Container $BORG_CONTAINER nicht gefunden" + return + fi + + if ! 
docker exec -i "$BORG_CONTAINER" python3 - <<'PY' >> "$BODY_PATH" +import sqlite3 + +def fmt_bytes(value): + if value is None: + return "-" + value = float(value) + units = ["B", "KB", "MB", "GB", "TB"] + for unit in units: + if value < 1024 or unit == units[-1]: + return f"{value:.1f} {unit}" if unit != "B" else f"{int(value)} B" + value /= 1024 + +def fmt_sec(s): + s = int(s) + h, rem = divmod(s, 3600) + m, sec = divmod(rem, 60) + if h > 0: + return f"{h}h {m}m" + return f"{m}m {sec}s" + +conn = sqlite3.connect("/data/borg.db") +conn.row_factory = sqlite3.Row +cur = conn.cursor() + +print("### Letzte Backup-Jobs") +rows = cur.execute(""" + select id, status, started_at, completed_at, archive_name, nfiles, + original_size, compressed_size, deduplicated_size, error_message + from backup_jobs + where started_at >= datetime('now', '-30 hours') + or created_at >= datetime('now', '-30 hours') + order by coalesce(started_at, created_at) desc + limit 8 +""").fetchall() + +if not rows: + print("- WARNUNG: Kein Backup-Job in den letzten 30 Stunden gefunden.") +else: + print("| Zeit UTC | Status | Archiv | Dateien | Original | Dedupliziert |") + print("|---|---:|---|---:|---:|---:|") + for row in rows: + archive = row["archive_name"] or "-" + if len(archive) > 54: + archive = archive[:51] + "..." 
+ print( + f"| {row['started_at'] or row['completed_at'] or '-'} " + f"| {row['status']} " + f"| {archive} " + f"| {row['nfiles'] if row['nfiles'] is not None else '-'} " + f"| {fmt_bytes(row['original_size'])} " + f"| {fmt_bytes(row['deduplicated_size'])} |" + ) + if row["error_message"]: + print(f" - Fehler: {row['error_message'][:240]}") + +print("") +print("### Zeitplan") +for row in cur.execute(""" + select name, enabled, last_run, next_run, cron_expression + from scheduled_jobs + order by id +"""): + enabled = "aktiv" if row["enabled"] else "pausiert" + print(f"- {row['name']}: {enabled}, last={row['last_run'] or '-'}, next={row['next_run'] or '-'}, cron=`{row['cron_expression']}`") + +print("") +print("### Dauer-Drift (Median 14 Tage)") +duration_rows = cur.execute(""" + select started_at, completed_at, + (julianday(completed_at) - julianday(started_at)) * 86400 as duration_seconds + from backup_jobs + where status = 'completed' + and started_at is not null + and completed_at is not null + and completed_at >= datetime('now', '-14 days') + order by completed_at desc +""").fetchall() + +durations = [r["duration_seconds"] for r in duration_rows if r["duration_seconds"] and r["duration_seconds"] > 0] + +if len(durations) < 3: + print(f"- Zu wenig Datenpunkte fuer eine Drift-Bewertung (n={len(durations)}).") +else: + durations_sorted = sorted(durations) + median = durations_sorted[len(durations_sorted) // 2] + latest = durations[0] + ratio = latest / median if median > 0 else 0 + print(f"- Letzter Lauf: {fmt_sec(latest)}") + print(f"- Median 14 Tage: {fmt_sec(median)} (n={len(durations)})") + print(f"- Verhaeltnis: {ratio:.2f}x") + if ratio > 2.0: + print(f"- Bewertung: Drift erkannt - letzter Lauf {ratio:.1f}x langsamer als der Median. Quellgroesse, IO und Repo-Zustand pruefen.") + else: + print("- Bewertung: Backup-Dauer im erwarteten Bereich.") +PY + then + append "- WARNUNG: Borg-Auswertung fehlgeschlagen." 
+ set_summary "borg_status" "unknown" + set_summary "backup_duration_drift" "unknown" + record_section_error "borg" "Python-Auswertung in $BORG_CONTAINER fehlgeschlagen" + else + local borg_out borg_status borg_drift + borg_out="$(docker exec -i "$BORG_CONTAINER" python3 - <<'PY' 2>/dev/null || true +import sqlite3 +conn = sqlite3.connect("/data/borg.db") +conn.row_factory = sqlite3.Row +cur = conn.cursor() +status_row = cur.execute(""" + select status + from backup_jobs + order by coalesce(started_at, created_at) desc + limit 1 +""").fetchone() +status = status_row[0] if status_row else "missing" + +duration_rows = cur.execute(""" + select (julianday(completed_at) - julianday(started_at)) * 86400 as ds + from backup_jobs + where status = 'completed' + and started_at is not null + and completed_at is not null + and completed_at >= datetime('now', '-14 days') + order by completed_at desc +""").fetchall() +durations = [r[0] for r in duration_rows if r[0] and r[0] > 0] +if len(durations) < 3: + drift = "insufficient" +else: + median = sorted(durations)[len(durations)//2] + latest = durations[0] + ratio = latest / median if median > 0 else 0 + drift = "1" if ratio > 2.0 else "0" + +print(f"status={status}") +print(f"drift={drift}") +PY +)" + borg_status="$(printf '%s' "$borg_out" | sed -n 's/^status=//p' | head -n 1)" + borg_drift="$(printf '%s' "$borg_out" | sed -n 's/^drift=//p' | head -n 1)" + if [ "${borg_drift:-}" = "1" ]; then + set_summary "backup_duration_drift" "1" + elif [ "${borg_drift:-}" = "0" ]; then + set_summary "backup_duration_drift" "0" + else + set_summary "backup_duration_drift" "unknown" + fi + set_summary "borg_status" "${borg_status:-unknown}" + fi + + append "" +} + +collect_prometheus() { + append "## Prometheus Alerts" + append "" + + if ! have_container "$PROMETHEUS_CONTAINER"; then + append "- WARNUNG: Container \`$PROMETHEUS_CONTAINER\` nicht gefunden." 
+ append "" + set_summary "prometheus_alerts" "unknown" + set_summary "prometheus_alerts_firing" "unknown" + set_summary "prometheus_alerts_pending" "unknown" + record_section_error "prometheus" "Container $PROMETHEUS_CONTAINER nicht gefunden" + return + fi + + local alerts + alerts="$(docker exec "$PROMETHEUS_CONTAINER" wget -qO- http://localhost:9090/api/v1/alerts 2>/dev/null || true)" + if [ -z "$alerts" ]; then + append "- WARNUNG: Prometheus Alerts API nicht erreichbar." + set_summary "prometheus_alerts" "unknown" + set_summary "prometheus_alerts_firing" "unknown" + set_summary "prometheus_alerts_pending" "unknown" + record_section_error "prometheus" "Alerts-API leer oder nicht erreichbar" + elif printf '%s' "$alerts" | grep -q '"alerts":\[\]'; then + append "- Keine aktiven Alerts." + set_summary "prometheus_alerts" "0" + set_summary "prometheus_alerts_firing" "0" + set_summary "prometheus_alerts_pending" "0" + else + local total firing pending + total="$(printf '%s' "$alerts" | grep -o '"alertname":"[^"]*"' | count_lines)" + firing="$(printf '%s' "$alerts" | grep -o '"state":"firing"' | count_lines)" + pending="$(printf '%s' "$alerts" | grep -o '"state":"pending"' | count_lines)" + append "- Aktive Alerts insgesamt: $total" + append "- Davon firing: $firing" + append "- Davon pending: $pending" + append "" + append "### Details" + printf '%s' "$alerts" \ + | grep -o '"alertname":"[^"]*"\|"severity":"[^"]*"\|"instance":"[^"]*"\|"service":"[^"]*"\|"state":"[^"]*"' \ + | sed 's/^/ - /' >> "$BODY_PATH" + set_summary "prometheus_alerts" "$total" + set_summary "prometheus_alerts_firing" "$firing" + set_summary "prometheus_alerts_pending" "$pending" + fi + append "" +} + +collect_certificate_health() { + append "## Zertifikate" + append "" + + local cert_file="$TMP_DIR/certificates.tsv" + local cert_sorted="$TMP_DIR/certificates.sorted.tsv" + local warning_count=0 + local total_count=0 + : > "$cert_file" + + if [ ! 
-f "$TRAEFIK_ACME_PATH" ]; then + append "- WARNUNG: Traefik ACME-Datei nicht gefunden: $TRAEFIK_ACME_PATH" + set_summary "cert_warnings" "1" + record_section_error "certificates" "ACME-Datei $TRAEFIK_ACME_PATH fehlt" + append "" + return + fi + + if docker run -i --rm \ + -v "$TRAEFIK_ACME_PATH:/acme.json:ro" \ + python:3.13-alpine python - <<'PY' > "$cert_file" +import base64 +import json +import ssl +import tempfile +from datetime import datetime, timezone + +with open("/acme.json", "r", encoding="utf-8") as handle: + data = json.load(handle) + +now = datetime.now(timezone.utc) +for resolver in data.values(): + for cert in resolver.get("Certificates", []): + domain = cert.get("domain", {}).get("main") or "-" + sans = cert.get("domain", {}).get("sans") or [] + cert_b64 = cert.get("certificate") + if not cert_b64: + continue + pem = base64.b64decode(cert_b64) + with tempfile.NamedTemporaryFile(delete=False) as tmp: + tmp.write(pem) + tmp_path = tmp.name + decoded = ssl._ssl._test_decode_cert(tmp_path) + not_after = datetime.strptime(decoded["notAfter"], "%b %d %H:%M:%S %Y %Z").replace(tzinfo=timezone.utc) + days = (not_after - now).days + names = ", ".join([domain, *sans]) + print(f"{days}\t{not_after.date().isoformat()}\t{names}") +PY + then + if [ ! -s "$cert_file" ]; then + append "- WARNUNG: Keine Zertifikate in ACME-Datei gefunden." 
+ warning_count=1 + record_section_error "certificates" "ACME-Datei enthielt keine Zertifikate" + else + sort -n "$cert_file" > "$cert_sorted" + total_count="$(count_lines < "$cert_sorted")" + append "- Zertifikate gesamt: $total_count" + append "- Anzeige: die $CERT_MAX_ROWS Zertifikate mit der kuerzesten Restlaufzeit" + append "- Schwelle Warnung: weniger als $CERT_WARN_DAYS Tage" + append "" + append "| Resttage | Ablaufdatum UTC | Domains |" + append "|---:|---|---|" + while IFS="$(printf '\t')" read -r days expires domains; do + append "| $days | $expires | $domains |" + if [ "${days:-0}" -lt "$CERT_WARN_DAYS" ]; then + warning_count=$((warning_count + 1)) + fi + done < <(head -n "$CERT_MAX_ROWS" "$cert_sorted") + while IFS="$(printf '\t')" read -r days _expires _domains; do + if [ "${days:-0}" -lt "$CERT_WARN_DAYS" ]; then + warning_count=$((warning_count + 1)) + fi + done < <(tail -n +"$((CERT_MAX_ROWS + 1))" "$cert_sorted") + append "" + if [ "$warning_count" -eq 0 ]; then + append "Bewertung: Keine Zertifikate im kritischen Erneuerungsfenster unter $CERT_WARN_DAYS Tagen." + else + append "Bewertung: $warning_count Zertifikat(e) laufen in weniger als $CERT_WARN_DAYS Tagen ab und sollten beobachtet werden." + fi + fi + else + append "- WARNUNG: Zertifikate konnten nicht aus ACME-Datei gelesen werden." + warning_count=1 + record_section_error "certificates" "Auswertung der ACME-Datei fehlgeschlagen" + fi + + set_summary "cert_warnings" "$warning_count" + append "" +} + +collect_disk_health() { + append "## Storage / Filesystem" + append "" + + local disk_warnings=0 + local paths="/mnt/cache /mnt/disk1 /mnt/user /mnt/user/appdata /mnt/user/backups" + + append "- Schwelle Warnung: Nutzung ab ${DISK_USAGE_WARN_PCT}%" + append "" + append "| Pfad | Filesystem | Nutzung | Frei | Bewertung |" + append "|---|---|---:|---:|---|" + + for path in $paths; do + if [ ! 
-e "$path" ]; then + append "| $path | - | - | - | fehlt |" + disk_warnings=$((disk_warnings + 1)) + record_section_error "disk" "Kernpfad $path fehlt" + continue + fi + + local fstype usage avail verdict + fstype="$(findmnt -T "$path" -no FSTYPE 2>/dev/null | head -n 1 || true)" + usage="$(df -P "$path" 2>/dev/null | awk 'NR==2 { gsub("%", "", $5); print $5 }')" + avail="$(df -hP "$path" 2>/dev/null | awk 'NR==2 { print $4 }')" + verdict="ok" + + if ! printf '%s' "${usage:-}" | grep -Eq '^[0-9]+$'; then + usage="-" + verdict="unbekannt" + disk_warnings=$((disk_warnings + 1)) + elif [ "$usage" -ge "$DISK_USAGE_WARN_PCT" ]; then + verdict="Warnung: >=${DISK_USAGE_WARN_PCT}%" + disk_warnings=$((disk_warnings + 1)) + fi + + append "| $path | ${fstype:-unbekannt} | ${usage}% | ${avail:-?} | $verdict |" + done + + append "" + if [ "$disk_warnings" -eq 0 ]; then + append "Bewertung: Keine kritischen Fuellstaende oder fehlenden Kernpfade erkannt." + else + append "Bewertung: $disk_warnings Storage-/Filesystem-Punkt(e) brauchen Aufmerksamkeit." 
+ fi + + set_summary "disk_warnings" "$disk_warnings" + append "" +} + +collect_image_freshness() { + append "## Image-Aktualitaet" + append "" + + local image_file="$TMP_DIR/images.tsv" + local image_warnings=0 + local now_epoch + : > "$image_file" + now_epoch="$(date +%s)" + + while IFS= read -r name; do + [ -n "$name" ] || continue + local image_id created_iso created_epoch age_days image_tag + image_id="$(docker inspect --format '{{.Image}}' "$name" 2>/dev/null || true)" + [ -n "$image_id" ] || continue + created_iso="$(docker image inspect --format '{{.Created}}' "$image_id" 2>/dev/null || true)" + image_tag="$(docker inspect --format '{{.Config.Image}}' "$name" 2>/dev/null || echo '?')" + [ -n "$created_iso" ] || continue + created_epoch="$(date -d "$created_iso" +%s 2>/dev/null || echo 0)" + [ "$created_epoch" -gt 0 ] || continue + age_days=$(( (now_epoch - created_epoch) / 86400 )) + printf '%d\t%s\t%s\n' "$age_days" "$name" "$image_tag" >> "$image_file" + if [ "$age_days" -ge "$IMAGE_AGE_WARN_DAYS" ]; then + image_warnings=$((image_warnings + 1)) + fi + done < <(docker ps --format '{{.Names}}') + + set_summary "image_warnings" "$image_warnings" + + if [ ! -s "$image_file" ]; then + append "- Keine Image-Daten verfuegbar." + record_section_error "images" "Keine Image-Daten ermittelt" + else + append "- Schwelle Warnung: Image aelter als $IMAGE_AGE_WARN_DAYS Tage" + append "- Container mit Image >= $IMAGE_AGE_WARN_DAYS Tage: $image_warnings" + append "" + append "### Aelteste Images (Top 10)" + append "" + append "| Alter Tage | Container | Image |" + append "|---:|---|---|" + sort -nr "$image_file" | head -n 10 | while IFS="$(printf '\t')" read -r age name img; do + append "| $age | $name | $img |" + done + append "" + if [ "$image_warnings" -eq 0 ]; then + append "Bewertung: Keine Container mit ueberalterten Images. CVE-Hygiene aus dieser Sicht ok." + else + append "Bewertung: $image_warnings Container nutzen Images aelter als $IMAGE_AGE_WARN_DAYS Tage. 
Update-Pipeline und CVE-Status pruefen." + fi + fi + append "" +} + +collect_container_events() { + append "## Docker Events ($SINCE)" + append "" + + local events_file="$TMP_DIR/docker-events.log" + timeout 20 docker events \ + --since "$SINCE" \ + --until "$(date -Iseconds)" \ + --filter event=die \ + --filter event=oom \ + --filter event=kill \ + --filter event=restart \ + --format '{{.Time}}|{{.Actor.Attributes.name}}|{{.Action}}|{{.Actor.Attributes.exitCode}}|{{.Actor.Attributes.image}}' \ + | awk -F '|' '!(($3 == "die") && ($4 == "0")) { print }' \ + > "$events_file" 2>/dev/null || true + + local event_count + event_count="$(count_lines < "$events_file")" + set_summary "docker_events" "$event_count" + + if [ "$event_count" -eq 0 ]; then + append '- Keine `die`/`oom`/`kill`/`restart` Events im Zeitraum.' + else + append "- Relevante Events: $event_count" + append "" + append '```text' + tail -n 80 "$events_file" >> "$BODY_PATH" + append '```' + fi + append "" +} + +collect_container_state() { + append "## Container-Zustand" + append "" + append "### Nicht laufende Container" + local stopped_file="$TMP_DIR/stopped.log" + docker ps -a --filter status=exited --filter status=dead --filter status=created --format '{{.Names}}\t{{.Status}}' > "$stopped_file" + if [ ! -s "$stopped_file" ]; then + append "- Keine." + else + append '```text' + cat "$stopped_file" >> "$BODY_PATH" + append '```' + fi + append "" + + append "### Container mit RestartCount > 0" + local restart_file="$TMP_DIR/restarts.log" + : > "$restart_file" + while IFS= read -r name; do + [ -n "$name" ] || continue + local count + count="$(docker inspect "$name" --format '{{.RestartCount}}' 2>/dev/null || echo 0)" + if [ "${count:-0}" -gt 0 ]; then + printf '%s\t%s\n' "$name" "$count" >> "$restart_file" + fi + done < <(docker ps -a --format '{{.Names}}') + + if [ ! -s "$restart_file" ]; then + append "- Keine." 
+ else + append '```text' + sort -k2,2nr "$restart_file" >> "$BODY_PATH" + append '```' + fi + append "" +} + +collect_traefik_5xx() { + append "## Traefik 5xx ($SINCE)" + append "" + + if ! have_container traefik; then + append "- Traefik-Container nicht gefunden." + append "" + record_section_error "traefik" "Container traefik nicht gefunden" + return + fi + + local file="$TMP_DIR/traefik-5xx.log" + docker logs --since "$SINCE" traefik 2>&1 \ + | awk '$9 ~ /^5[0-9][0-9]$/ { print }' \ + > "$file" || true + + local count + count="$(count_lines < "$file")" + set_summary "traefik_5xx" "$count" + + if [ "$count" -eq 0 ]; then + append "- Keine 5xx-Antworten." + else + append "- 5xx-Antworten: $count" + append "" + append "### Gruppiert nach Service/Code" + append '```text' + awk '{ code=$9; service=$12; gsub(/"/, "", service); counts[service " " code]++ } END { for (k in counts) print counts[k], k }' "$file" | sort -nr >> "$BODY_PATH" + append '```' + append "" + append "### Letzte Zeilen" + append '```text' + tail -n "$MAX_LOG_LINES" "$file" >> "$BODY_PATH" + append '```' + fi + append "" +} + +collect_log_highlights() { + append "## Log-Auswertung ($SINCE)" + append "" + append "Ziel dieses Abschnitts ist nicht, Rohlogs zu wiederholen, sondern handlungsrelevante Auffaelligkeiten auszusortieren." 
+ append "" + + local hits="$TMP_DIR/log-hits.log" + local attention="$TMP_DIR/log-attention.log" + local known_noise="$TMP_DIR/log-known-noise.log" + : > "$hits" + : > "$attention" + : > "$known_noise" + + while IFS= read -r name; do + [ -n "$name" ] || continue + docker logs --since "$SINCE" "$name" 2>&1 \ + | grep -Eai 'error|fatal|panic|exception|failed|denied|unauthorized|forbidden|oom' \ + | grep -Eavi 'level=info|levelname.: .INFO| 200 OK| 404 Not Found|healthcheck|probe_success' \ + | grep -Eavi 'production.DEBUG|stats_refresh_scheduler.*errors.: 0|Sync completed.*Failed: 0' \ + | sed -E 's/(refresh_token: )[A-Za-z0-9._-]+/\1[REDACTED]/Ig; s/(token: )[A-Za-z0-9._-]+/\1[REDACTED]/Ig; s/(Authorization: )[A-Za-z0-9._ -]+/\1[REDACTED]/Ig' \ + | sed "s/^/[$name] /" >> "$hits" || true + done < <(docker ps --format '{{.Names}}') + + # Normalize the noise pattern file (drop comments, empty lines, trim + # whitespace). An empty or whitespace-only pattern line would otherwise + # make grep -Eaif match every hit and silently wipe the log highlights. + local noise_normalized="$TMP_DIR/noise.patterns.normalized" + : > "$noise_normalized" + if [ -f "$NOISE_PATTERNS_FILE" ]; then + if [ -x "$NORMALIZE_NOISE_SCRIPT" ]; then + "$NORMALIZE_NOISE_SCRIPT" "$NOISE_PATTERNS_FILE" > "$noise_normalized" 2>/dev/null || : > "$noise_normalized" + else + record_section_error "log-highlights" "Normalize-Helper fehlt oder nicht ausfuehrbar: $NORMALIZE_NOISE_SCRIPT - Noise-Patterns ungenormt verwendet" + # Fallback inline (same logic as the helper) so we still avoid the + # "empty line matches all" trap. 
+ grep -Ev '^[[:space:]]*(#|$)' "$NOISE_PATTERNS_FILE" 2>/dev/null \ + | sed -E 's/^[[:space:]]+//; s/[[:space:]]+$//' \ + | grep -v '^$' > "$noise_normalized" || : > "$noise_normalized" + fi + else + record_section_error "log-highlights" "Noise-Pattern-Datei $NOISE_PATTERNS_FILE fehlt - alle Treffer gelten als handlungsrelevant" + fi + + if [ -s "$hits" ]; then + if [ -s "$noise_normalized" ]; then + grep -Eaif "$noise_normalized" "$hits" > "$known_noise" || true + fi + if [ -s "$known_noise" ]; then + # Normalisierung gegen abweichende Whitespace-Enden + sed -E 's/[[:space:]]+$//' "$known_noise" > "$known_noise.norm" + sed -E 's/[[:space:]]+$//' "$hits" > "$hits.norm" + grep -Fvxf "$known_noise.norm" "$hits.norm" > "$attention" || true + else + cp "$hits" "$attention" + fi + fi + + # Per-container noise breakdown (always computed, even if SHOW_KNOWN_NOISE=0). + local noise_by_container="$TMP_DIR/noise-by-container.tsv" + : > "$noise_by_container" + if [ -s "$known_noise" ]; then + awk -F '[][]' '{ counts[$2]++ } END { for (n in counts) print counts[n] "\t" n }' "$known_noise" \ + | sort -nr > "$noise_by_container" + fi + + # Per-pattern noise breakdown: count how often each pattern hit in $hits. + # Note: a single hit line may match multiple patterns; counts can overlap. + local noise_by_pattern="$TMP_DIR/noise-by-pattern.tsv" + : > "$noise_by_pattern" + if [ -s "$noise_normalized" ] && [ -s "$hits" ]; then + while IFS= read -r p; do + [ -n "$p" ] || continue + local pcount + pcount="$(grep -Eaic -- "$p" "$hits" 2>/dev/null || echo 0)" + if [ "${pcount:-0}" -gt 0 ]; then + printf '%d\t%s\n' "$pcount" "$p" >> "$noise_by_pattern" + fi + done < "$noise_normalized" + if [ -s "$noise_by_pattern" ]; then + sort -nr -o "$noise_by_pattern" "$noise_by_pattern" + fi + fi + + # Threshold escalation: how many patterns produced more than the threshold? 
+ local noise_threshold_exceeded=0 + if [ -s "$noise_by_pattern" ]; then + noise_threshold_exceeded="$(awk -v t="$NOISE_ESCALATION_THRESHOLD" '$1 > t { n++ } END { print n + 0 }' "$noise_by_pattern")" + fi + set_summary "noise_threshold_exceeded" "$noise_threshold_exceeded" + + local hit_count attention_count known_noise_count + hit_count="$(count_lines < "$hits")" + attention_count="$(count_lines < "$attention")" + known_noise_count="$(count_lines < "$known_noise")" + set_summary "log_highlights" "$attention_count" + set_summary "log_hits_total" "$hit_count" + set_summary "log_known_noise" "$known_noise_count" + + if [ "$hit_count" -eq 0 ]; then + append "- Keine auffaelligen Logmuster gefunden." + else + append "- Gefundene Logmuster insgesamt: $hit_count" + append "- Davon als bekanntes Rauschen eingeordnet: $known_noise_count" + append "- Handlungsrelevante Logmuster: $attention_count" + append "- Noise-Pattern-Quelle: \`$NOISE_PATTERNS_FILE\`" + append "- Eskalations-Schwelle pro Pattern: $NOISE_ESCALATION_THRESHOLD" + if [ "$noise_threshold_exceeded" -gt 0 ]; then + append "- WARNUNG: $noise_threshold_exceeded Pattern ueberschreit(en) die Schwelle - bitte pruefen ob noch wirklich Noise." + fi + append "" + + if [ "$attention_count" -eq 0 ]; then + append "Bewertung: Keine handlungsrelevanten Logmuster. Die Treffer bestehen aus bekannten, aktuell nicht kritischen Meldungen." + else + append "Bewertung: Es gibt Logmuster, die nicht automatisch als bekanntes Rauschen eingeordnet wurden. Diese sollten geprueft werden." 
+ append "" + append "### Betroffene Container" + append "" + append "| Container | Anzahl |" + append "|---|---:|" + awk -F '[][]' '{ counts[$2]++ } END { for (name in counts) print "| " name " | " counts[name] " |" }' "$attention" | sort >> "$BODY_PATH" + append "" + append "### Beispiele" + append "" + append '```text' + awk -F '[][]' ' + { + name=$2 + if (seen[name] < 3) { + line=$0 + gsub(/[[:space:]]+/, " ", line) + if (length(line) > 220) line=substr(line, 1, 217) "..." + print line + seen[name]++ + } + } + ' "$attention" | head -n "$MAX_LOG_LINES" >> "$BODY_PATH" + append '```' + fi + + if [ "$known_noise_count" -gt 0 ]; then + append "" + append "### Bekanntes Rauschen (Top)" + append "" + if [ -s "$noise_by_container" ]; then + append "#### Container mit den meisten Noise-Treffern" + append "" + append "| Container | Anzahl |" + append "|---|---:|" + head -n "$NOISE_BREAKDOWN_TOP_N" "$noise_by_container" \ + | while IFS="$(printf '\t')" read -r cnt cname; do + append "| ${cname:-?} | $cnt |" + done + append "" + fi + if [ -s "$noise_by_pattern" ]; then + append "#### Pattern mit den meisten Treffern" + append "" + append "| Pattern | Anzahl |" + append "|---|---:|" + head -n "$NOISE_BREAKDOWN_TOP_N" "$noise_by_pattern" \ + | while IFS="$(printf '\t')" read -r cnt pat; do + local short="$pat" + if [ "${#short}" -gt 80 ]; then + short="${short:0:77}..." + fi + # Escape pipe characters that would break the markdown table. + short="${short//|/\\|}" + append "| \`$short\` | $cnt |" + done + append "" + fi + if [ "$noise_threshold_exceeded" -gt 0 ]; then + append "Bewertung: $noise_threshold_exceeded Pattern ueberschreit(en) die Eskalations-Schwelle ($NOISE_ESCALATION_THRESHOLD). Bitte pruefen, ob die als Noise eingeordneten Meldungen noch fachlich Noise sind oder ob sich ein echter Vorfall darunter versteckt." + else + append "Bewertung: Kein Pattern ueberschreitet die Eskalations-Schwelle ($NOISE_ESCALATION_THRESHOLD)." 
# Append the "Log-Volumen" section: number of log lines per running container
# within $SINCE, the grand total, and a table of the loudest containers.
# Globals:  SINCE, TMP_DIR, LOG_VOLUME_TOP_N (read)
# Outputs:  markdown via append; log_volume_total via set_summary; problems
#           via record_section_error.
collect_log_volume() {
  append "## Log-Volumen ($SINCE)"
  append ""

  local volume_file="$TMP_DIR/log-volume.tsv"
  : > "$volume_file"

  local name count
  while IFS= read -r name; do
    [ -n "$name" ] || continue
    # A container can vanish between `docker ps` and `docker logs`. With
    # errexit + pipefail a bare assignment would abort the whole report, so
    # the failure is tested explicitly, recorded, and the container skipped.
    if ! count="$(docker logs --since "$SINCE" "$name" 2>&1 | count_lines)"; then
      record_section_error "log-volume" "docker logs fuer $name fehlgeschlagen"
      continue
    fi
    printf '%d\t%s\n' "$count" "$name" >> "$volume_file"
  done < <(docker ps --format '{{.Names}}')

  local total
  total="$(awk '{ s += $1 } END { print s + 0 }' "$volume_file")"
  set_summary "log_volume_total" "$total"

  if [ "$total" -eq 0 ]; then
    append "- Keine Logzeilen im Zeitraum (unwahrscheinlich, evtl. Datenquelle pruefen)."
    record_section_error "log-volume" "Log-Volumen ueber alle Container ist 0"
  else
    append "- Zeilen insgesamt im Zeitraum: $total"
    append ""
    append "### Top $LOG_VOLUME_TOP_N lauteste Container"
    append ""
    append "| Container | Zeilen |"
    append "|---|---:|"
    # Feeding the loop through process substitution keeps an early `head`
    # exit (SIGPIPE on sort) from tripping pipefail.
    local line_total container
    while IFS=$'\t' read -r line_total container; do
      append "| $container | $line_total |"
    done < <(sort -nr "$volume_file" | head -n "$LOG_VOLUME_TOP_N")
    append ""
    append "Bewertung: Auffaellig laute Container sind oft ein Frueh-Indikator fuer Endlosschleifen, schlecht konfigurierte Loglevel oder Probe-Spam."
  fi
  append ""
}

# Append the "Vergleich mit gestern" section: today's summary values next to
# the persisted summary of the day before REPORT_DATE.
# Globals:  REPORT_DATE, REPORT_DIR, SUMMARY_PATH (read). Sourcing
#           SUMMARY_PATH defines today's summary keys as shell variables in
#           the calling shell.
# Outputs:  markdown via append.
collect_diff_yesterday() {
  append "## Vergleich mit gestern"
  append ""

  # The previous day is derived from REPORT_DATE (which can be overridden)
  # rather than from the wall clock, so a report generated for another date
  # is compared with the matching summary file. Noon is used as the anchor to
  # stay clear of DST gaps around midnight. If REPORT_DATE cannot be parsed,
  # fall back to the wall-clock "yesterday".
  local yesterday yesterday_summary
  yesterday="$(date -d "$REPORT_DATE 12:00 1 day ago" +%F 2>/dev/null \
    || date -d 'yesterday' +%F 2>/dev/null \
    || true)"
  yesterday_summary="$REPORT_DIR/summary-$yesterday.env"

  if [ -z "$yesterday" ] || [ ! -f "$yesterday_summary" ]; then
    append "- Keine Vortagsdaten verfuegbar ($yesterday_summary)."
    append ""
    return
  fi

  local prev_borg='' prev_alerts='' prev_firing='' prev_pending=''
  local prev_unhealthy='' prev_exited='' prev_5xx='' prev_events=''
  local prev_log='' prev_certs='' prev_disk='' prev_img=''
  local prev_drift='' prev_vol=''
  local key value
  # Yesterday's file is parsed as data (never sourced); unknown keys are
  # ignored. The `|| [ -n "$key" ]` keeps a final line without newline.
  while IFS='=' read -r key value || [ -n "$key" ]; do
    case "$key" in
      borg_status) prev_borg="$value" ;;
      prometheus_alerts) prev_alerts="$value" ;;
      prometheus_alerts_firing) prev_firing="$value" ;;
      prometheus_alerts_pending) prev_pending="$value" ;;
      containers_unhealthy) prev_unhealthy="$value" ;;
      containers_exited_nonzero) prev_exited="$value" ;;
      traefik_5xx) prev_5xx="$value" ;;
      docker_events) prev_events="$value" ;;
      log_highlights) prev_log="$value" ;;
      cert_warnings) prev_certs="$value" ;;
      disk_warnings) prev_disk="$value" ;;
      image_warnings) prev_img="$value" ;;
      backup_duration_drift) prev_drift="$value" ;;
      log_volume_total) prev_vol="$value" ;;
    esac
  done < "$yesterday_summary"

  # Today's values are loaded by sourcing the summary written by set_summary.
  # NOTE(review): this executes the file as shell; values containing spaces
  # or shell metacharacters would break here - confirm set_summary callers
  # only ever write simple tokens.
  # shellcheck disable=SC1090
  . "$SUMMARY_PATH"

  append "Vergleich des Datums $REPORT_DATE mit $yesterday."
  append ""
  append "| Metrik | Heute | Gestern |"
  append "|---|---:|---:|"
  append "| Borg Status | ${borg_status:-?} | ${prev_borg:-?} |"
  append "| Prometheus Alerts gesamt | ${prometheus_alerts:-?} | ${prev_alerts:-?} |"
  append "| Prometheus firing | ${prometheus_alerts_firing:-?} | ${prev_firing:-?} |"
  append "| Prometheus pending | ${prometheus_alerts_pending:-?} | ${prev_pending:-?} |"
  append "| Container unhealthy | ${containers_unhealthy:-?} | ${prev_unhealthy:-?} |"
  append "| Container exited non-zero | ${containers_exited_nonzero:-?} | ${prev_exited:-?} |"
  append "| Docker Events | ${docker_events:-?} | ${prev_events:-?} |"
  append "| Traefik 5xx | ${traefik_5xx:-?} | ${prev_5xx:-?} |"
  append "| Log-Highlights | ${log_highlights:-?} | ${prev_log:-?} |"
  append "| Log-Volumen | ${log_volume_total:-?} | ${prev_vol:-?} |"
  append "| Zertifikatswarnungen | ${cert_warnings:-?} | ${prev_certs:-?} |"
  append "| Storage-Warnungen | ${disk_warnings:-?} | ${prev_disk:-?} |"
  append "| Image-Warnungen | ${image_warnings:-?} | ${prev_img:-?} |"
  append "| Backup-Dauer-Drift | ${backup_duration_drift:-?} | ${prev_drift:-?} |"
  append ""

  # Only this subset of metrics marks the day as "notable"; the remaining
  # rows are shown in the table but do not influence the verdict.
  local notable=0
  if [ "${containers_exited_nonzero:-0}" != "${prev_exited:-0}" ] \
    || [ "${containers_unhealthy:-0}" != "${prev_unhealthy:-0}" ] \
    || [ "${prometheus_alerts_firing:-0}" != "${prev_firing:-0}" ] \
    || [ "${prometheus_alerts_pending:-0}" != "${prev_pending:-0}" ] \
    || [ "${log_highlights:-0}" != "${prev_log:-0}" ] \
    || [ "${borg_status:-unknown}" != "${prev_borg:-unknown}" ] \
    || [ "${backup_duration_drift:-0}" != "${prev_drift:-0}" ]; then
    notable=1
  fi

  if [ "$notable" -eq 0 ]; then
    append "Bewertung: Keine relevanten Aenderungen gegenueber gestern."
  else
    append "Bewertung: Relevante Aenderungen gegenueber gestern. Details bitte in den einzelnen Abschnitten pruefen."
  fi
  append ""
}

# Append the "Self-Health" section: script runtime, number of recorded
# section errors, presence of the noise pattern file and the lock file path.
# Globals:  SCRIPT_START, SECTION_ERRORS_FILE, NOISE_PATTERNS_FILE,
#           LOCK_FILE (read)
# Outputs:  markdown via append; script_duration_seconds and
#           section_failures via set_summary.
collect_self_health() {
  append "## Self-Health"
  append ""

  local script_duration section_failures noise_file_present="nein"
  script_duration=$(( $(date +%s) - SCRIPT_START ))
  section_failures="$(count_lines < "$SECTION_ERRORS_FILE")"
  if [ -f "$NOISE_PATTERNS_FILE" ]; then
    noise_file_present="ja"
  fi

  set_summary "script_duration_seconds" "$script_duration"
  set_summary "section_failures" "$section_failures"

  append "- Skript-Laufzeit: $(format_duration "$script_duration") (${script_duration}s)"
  append "- Sektionen mit Fehlern: $section_failures"
  append "- Noise-Pattern-Datei vorhanden: $noise_file_present"
  append "- Lock-Datei: \`$LOCK_FILE\`"

  if [ "$section_failures" -gt 0 ]; then
    append ""
    append "### Fehlerhafte Sektionen"
    append ""
    local line
    while IFS= read -r line; do
      append "- $line"
    done < "$SECTION_ERRORS_FILE"
  fi
  append ""
}
Der Bericht sollte zeitnah gelesen und die betroffenen Komponenten priorisiert geprueft werden.\n\n' + fi + printf '### Management-Bewertung\n\n' + printf '%s\n' "- Status: \`$REPORT_STATUS\`" + printf '%s\n' "- Borg Backup: \`${borg_status:-unknown}\`" + printf '%s\n' "- Backup-Dauer-Drift: \`${backup_duration_drift:-unknown}\`" + printf '%s\n' "- Prometheus Alerts (gesamt/firing/pending): \`${prometheus_alerts:-unknown}\` / \`${prometheus_alerts_firing:-unknown}\` / \`${prometheus_alerts_pending:-unknown}\`" + printf '%s\n' "- Container unhealthy: \`${containers_unhealthy:-unknown}\`" + printf '%s\n' "- Container exited non-zero: \`${containers_exited_nonzero:-unknown}\`" + printf '%s\n' "- Docker Critical Events: \`${docker_events:-unknown}\`" + printf '%s\n' "- Traefik 5xx: \`${traefik_5xx:-unknown}\`" + printf '%s\n' "- Zertifikatswarnungen: \`${cert_warnings:-unknown}\`" + printf '%s\n' "- Storage-Warnungen: \`${disk_warnings:-unknown}\`" + printf '%s\n' "- Image-Warnungen: \`${image_warnings:-unknown}\`" + printf '%s\n' "- Log-Highlights: \`${log_highlights:-unknown}\`" + printf '%s\n' "- Noise-Pattern ueber Schwelle: \`${noise_threshold_exceeded:-0}\`" + printf '%s\n' "- Log-Volumen gesamt: \`${log_volume_total:-unknown}\`" + printf '%s\n' "- Reboot in letzten 24h: \`${host_recent_boot:-unknown}\`" + printf '%s\n\n' "- Sektionsfehler im Skript: \`${section_failures:-unknown}\`" + printf '### Einordnung\n\n' + printf 'Dieser Report ist ein Management-Lagebericht: Er verdichtet Backup-Status, Container-Zustand, Monitoring-Alerts, Traefik-Fehler, Zertifikate, Storage, Image-Aktualitaet, Log-Volumen und Drift-Indikatoren. Rohlogs werden nur ausschnittsweise gezeigt, damit der Bericht lesbar bleibt und trotzdem nachvollziehbar ist.\n\n' + cat "$BODY_PATH" + printf '## Schlussbewertung\n\n' + if [ "$REPORT_STATUS" = "OK" ]; then + printf 'Das Homelab war im betrachteten Zeitraum betriebsfaehig und ohne akute Warnsignale. 
# Send the rendered report by mail when SEND_MAIL=1 and MAIL_MODE allows it
# for the current REPORT_STATUS.
# Globals:  SEND_MAIL, MAIL_SCRIPT, MAIL_MODE, REPORT_STATUS, REPORT_PATH (read)
# Returns:  0 when mail is disabled, not due, or sent; 1 when the mail script
#           is missing or MAIL_MODE is unknown; the mail script's own exit
#           code when delivery fails.
send_report_mail() {
  [ "$SEND_MAIL" = "1" ] || return 0

  if [ ! -x "$MAIL_SCRIPT" ]; then
    echo "Mail script missing or not executable: $MAIL_SCRIPT" >&2
    record_section_error "mail" "Mail-Skript $MAIL_SCRIPT fehlt oder nicht ausfuehrbar"
    return 1
  fi

  local rc=0
  case "$MAIL_MODE:$REPORT_STATUS" in
    always:*|warning:WARNUNG|warning:KRITISCH|critical:KRITISCH)
      # Capture the status explicitly so a delivery failure leaves a trace
      # instead of silently terminating the run under errexit.
      "$MAIL_SCRIPT" "$REPORT_PATH" "$REPORT_STATUS" || rc=$?
      if [ "$rc" -ne 0 ]; then
        echo "Mail script failed with exit code $rc: $MAIL_SCRIPT" >&2
        record_section_error "mail" "Mail-Versand fehlgeschlagen (Exit-Code $rc)"
        return "$rc"
      fi
      ;;
    warning:*|critical:*)
      # Known mode, but the current status does not trigger a mail.
      ;;
    *)
      echo "Unknown MAIL_MODE '$MAIL_MODE' - mail not sent. Use always|warning|critical." >&2
      record_section_error "mail" "Unbekanntes MAIL_MODE '$MAIL_MODE'"
      return 1
      ;;
  esac
}

# Push a compact plain-text summary through the ntfy helper script.
# Best-effort: a missing helper or a failing push never fails the run.
# Globals:  SEND_NTFY, NTFY_SCRIPT, NTFY_TOPIC, SUMMARY_PATH, REPORT_PATH,
#           REPORT_STATUS (read). Sourcing SUMMARY_PATH defines the summary
#           keys as shell variables in the calling shell.
send_summary_ntfy() {
  [ "$SEND_NTFY" = "1" ] || return 0
  if [ ! -x "$NTFY_SCRIPT" ]; then
    echo "ntfy script missing or not executable, notification skipped: $NTFY_SCRIPT" >&2
    return 0
  fi

  # shellcheck disable=SC1090
  . "$SUMMARY_PATH"

  local title="Homelab Tagesprotokoll: ${REPORT_STATUS:-unknown} / borg=${borg_status:-unknown}"
  local body="Report: $REPORT_PATH
Status: $REPORT_STATUS
Container: ${containers_running:-?}/${containers_total:-?} running, unhealthy=${containers_unhealthy:-?}, exited_nonzero=${containers_exited_nonzero:-?}
Borg: ${borg_status:-unknown} (drift=${backup_duration_drift:-unknown})
Prometheus alerts (total/firing/pending): ${prometheus_alerts:-unknown}/${prometheus_alerts_firing:-unknown}/${prometheus_alerts_pending:-unknown}
Docker events: ${docker_events:-unknown}
Traefik 5xx: ${traefik_5xx:-unknown}
Certs warn: ${cert_warnings:-unknown}
Disk warn: ${disk_warnings:-unknown}
Image warn: ${image_warnings:-unknown}
Log highlights: ${log_highlights:-unknown}
Log volume: ${log_volume_total:-unknown}
Recent boot: ${host_recent_boot:-unknown}
Section errors: ${section_failures:-unknown}"

  # Map the report status to the priority string handed to the helper.
  local priority
  case "$REPORT_STATUS" in
    KRITISCH) priority="urgent" ;;
    WARNUNG) priority="high" ;;
    *) priority="default" ;;
  esac

  "$NTFY_SCRIPT" "$NTFY_TOPIC" "$title" "$body" "$priority" || true
}

# Run all collectors in report order, derive the overall status, write the
# report and dispatch notifications. Prints REPORT_PATH on success.
# Returns:  non-zero (the status of send_report_mail) when mail delivery
#           failed; the ntfy push is still attempted in that case so a broken
#           mail path does not also silence the second channel.
main() {
  collect_overview
  collect_host_health
  collect_borg
  collect_prometheus
  collect_certificate_health
  collect_disk_health
  collect_image_freshness
  collect_container_events
  collect_container_state
  collect_traefik_5xx
  collect_log_highlights
  collect_log_volume
  collect_diff_yesterday
  derive_report_status
  collect_self_health
  write_report

  local mail_rc=0
  send_report_mail || mail_rc=$?
  send_summary_ntfy
  if [ "$mail_rc" -ne 0 ]; then
    return "$mail_rc"
  fi

  printf '%s\n' "$REPORT_PATH"
}
# normalize-noise-patterns.sh
#
# Read a log-noise.patterns file and emit a normalized stream of patterns
# that is safe to feed into `grep -Eaif`.
#
# Behaviour:
#   - Lines starting with `#` (after optional leading whitespace) are dropped.
#   - Empty / whitespace-only lines are dropped.
#   - Leading and trailing whitespace is trimmed from each pattern.
#   - Patterns that become empty after trimming are dropped.
#   - An unreadable pattern file yields no patterns, a warning on stderr and
#     exit status 0 (best-effort: the caller then runs without a noise filter).
#
# Why this exists:
#   A single empty / whitespace-only line in the input file would make
#   `grep -Eaif` match every input line, silently wiping the entire log
#   highlights signal. Always pipe patterns through this normalizer first.
#
# Usage:
#   normalize-noise-patterns.sh <patterns-file>
#   cat patterns | normalize-noise-patterns.sh
set -euo pipefail

# normalize_patterns [FILE]
# Writes the normalized patterns from FILE (default: stdin) to stdout.
# Always returns 0; read problems are reported on stderr only.
normalize_patterns() {
  local src="${1:-/dev/stdin}"

  if [ "$src" != "/dev/stdin" ] && [ ! -r "$src" ]; then
    printf 'normalize-noise-patterns: cannot read %s - emitting no patterns\n' "$src" >&2
    return 0
  fi

  # Trim first, then drop: after trimming, every comment starts with `#` in
  # column one and every blank line is empty, so one pass is sufficient.
  sed -E \
    -e 's/^[[:space:]]+//' \
    -e 's/[[:space:]]+$//' \
    -e '/^$/d' \
    -e '/^#/d' \
    "$src" \
    || printf 'normalize-noise-patterns: error while reading %s\n' "$src" >&2
}

normalize_patterns "$@"
+# +# Last reviewed: 2026-05-21 + +# Loki internal query cancellations / scheduler chatter. +# Why: Loki cancels internal queries continuously when downstream Promtails +# or Grafana panels drop connections; no user-visible outage by itself. +# Re-check: if Grafana dashboards show real Loki query failures or if +# Prometheus alerts fire on Loki ingestion / availability. +monitoring-loki.*(context canceled|error notifying scheduler|closing iterator) + +# node-exporter parsing /host/proc/mdstat on Unraid. +# Why: Unraid uses its own array driver, not Linux mdadm, so /proc/mdstat +# layout is unparsable for node-exporter. Pure collector noise. +# Re-check: only if migrating to mdadm-based RAID. Then remove this entry +# and act on real mdadm errors. +monitoring-node-exporter.*mdadm.*Cannot parse /host/proc/mdstat + +# Gitea OpenID login attempts return 403. +# Why: OpenID provider is intentionally disabled in Gitea config; 403 is +# the expected response for stale OAuth callback URLs. +# Re-check: when OpenID/OIDC gets enabled again. Remove this and treat +# the 403 as a real auth failure signal. +gitea.*user/login/openid.*403 Forbidden + +# Uptime Kuma monitor for legacy domain grafana.kaleschke.info returning 404. +# Why: Monitor was not removed when the public Grafana endpoint was +# decommissioned. +# Re-check: at next Uptime-Kuma cleanup. Action: delete the obsolete monitor +# and remove this pattern. +uptime-kuma.*grafana\.kaleschke\.info.*status code 404 + +# Tailscale PCP port mapping failure (NAT-PMP unsupported by router). +# Why: Tailscale falls back to STUN/DERP transparently; no functional impact. +# Re-check: if Tailscale reports persistent connectivity problems in real +# usage, or if a router change adds NAT-PMP support. +Tailscale-Docker.*failed to get PCP mapping + +# Immich version check failed to reach GitHub releases API. +# Why: External GitHub release check; transient failures do not affect +# Immich core functionality. 
+# Re-check: if Immich UI persistently warns about being outdated or if +# security updates are missed because of this. +immich_server.*Failed to fetch latest release + +# Authelia 408 client-side request timeouts. +# Why: Clients (browsers, Vaultwarden-CLI etc.) drop slow connections; +# without correlated login failures or 5xx, individual 408s are normal. +# Re-check: if 408-rate spikes (>5/min sustained) or if login flows complain. +# Then narrow this pattern instead of removing. +authelia.*Request timeout occurred.*status_code=408 + +# Vaultwarden expired sessions and invalid refresh tokens (auth/session class). +# Why: Normal session expiry; clients retry and re-login transparently. +# Re-check: if many distinct external IPs trigger 401s in a short window +# (possible brute-force or credential-stuffing pattern). +# +# NOTE: DNS / Connect / Resolve / reqwest / hyper-client errors are +# intentionally NOT suppressed here. They are real network signals +# and should be visible in the attention list. If push-notification +# noise becomes overwhelming, add a *narrow* pattern restricted to +# push contexts only (e.g. `vaultwarden.*push.*(ResolveError|...)`). 
+vaultwarden.*(Token has expired|Invalid refresh token|Failed to decode.*refresh_token|POST /identity/connect/token => 401 Unauthorized) diff --git a/services/posture-check/send-operations-report-mail.sh b/services/posture-check/send-operations-report-mail.sh new file mode 100644 index 0000000..0eea470 --- /dev/null +++ b/services/posture-check/send-operations-report-mail.sh @@ -0,0 +1,612 @@ +#!/usr/bin/env bash +set -euo pipefail + +REPORT_PATH="${1:-}" +REPORT_STATUS="${2:-UNKNOWN}" + +MAIL_FROM="${MAIL_FROM:-michideheld@gmx.de}" +MAIL_TO="${MAIL_TO:-Mi.Kaleschke@gmx.de}" +SMTP_HOST="${SMTP_HOST:-smtp.gmx.net}" +SMTP_PORT="${SMTP_PORT:-587}" +SMTP_USER="${SMTP_USER:-$MAIL_FROM}" +SMTP_PASS_FILE="${SMTP_PASS_FILE:-/mnt/user/appdata/secrets/homelab_smtp_password.txt}" +MAIL_IMAGE="${MAIL_IMAGE:-python:3.13-alpine}" +MAIL_DNS_1="${MAIL_DNS_1:-1.1.1.1}" +MAIL_DNS_2="${MAIL_DNS_2:-8.8.8.8}" + +if [ -z "$REPORT_PATH" ] || [ ! -f "$REPORT_PATH" ]; then + echo "Usage: $0 [status]" >&2 + exit 1 +fi + +if [ ! 
-f "$SMTP_PASS_FILE" ]; then + echo "Missing SMTP password file: $SMTP_PASS_FILE" >&2 + exit 1 +fi + +REPORT_BASENAME="$(basename "$REPORT_PATH")" +REPORT_DATE="${REPORT_BASENAME#homelab-day-}" +REPORT_DATE="${REPORT_DATE%.md}" +SUBJECT="${MAIL_SUBJECT:-Homelab Operations Report - $REPORT_DATE - $REPORT_STATUS}" + +docker run -i --rm \ + --dns "$MAIL_DNS_1" \ + --dns "$MAIL_DNS_2" \ + -e MAIL_FROM="$MAIL_FROM" \ + -e MAIL_TO="$MAIL_TO" \ + -e SMTP_HOST="$SMTP_HOST" \ + -e SMTP_PORT="$SMTP_PORT" \ + -e SMTP_USER="$SMTP_USER" \ + -e MAIL_SUBJECT="$SUBJECT" \ + -e REPORT_STATUS="$REPORT_STATUS" \ + -e REPORT_HOSTNAME="$(hostname)" \ + -v "$REPORT_PATH:/report.md:ro" \ + -v "$SMTP_PASS_FILE:/smtp-password:ro" \ + "$MAIL_IMAGE" python - <<'PY' +import os +import html +import re +import smtplib +import ssl +from datetime import datetime, timezone +from email.message import EmailMessage +from pathlib import Path + +mail_from = os.environ["MAIL_FROM"] +mail_to = os.environ["MAIL_TO"] +smtp_host = os.environ["SMTP_HOST"] +smtp_port = int(os.environ.get("SMTP_PORT", "587")) +smtp_user = os.environ.get("SMTP_USER") or mail_from +subject = os.environ["MAIL_SUBJECT"] +report_status = os.environ.get("REPORT_STATUS", "UNKNOWN") +report_hostname = os.environ.get("REPORT_HOSTNAME", "") + +password = Path("/smtp-password").read_text(encoding="utf-8").strip() +report = Path("/report.md").read_text(encoding="utf-8") + +# ---------- Style constants ---------- + +COLORS = { + "bg": "#f7f8fa", + "card_bg": "#ffffff", + "text": "#0f172a", + "text_muted": "#475569", + "border": "#e2e8f0", + "border_strong": "#cbd5e1", + "zebra": "#f1f5f9", + "code_bg": "#eef2ff", + "code_text": "#3730a3", + "pre_bg": "#f8fafc", + "accent": "#3b82f6", +} + +STATUS_THEMES = { + "OK": {"banner_a": "#16a34a", "banner_b": "#22c55e", "card_bg": "#dcfce7", "card_border": "#86efac", "card_text": "#166534"}, + "WARNUNG": {"banner_a": "#d97706", "banner_b": "#f59e0b", "card_bg": "#fef3c7", "card_border": "#fcd34d", 
# Lower-cased value vocabularies used to colour summary cards.
OK_VALUES = {"ok", "completed", "0", "aktiv", "ja"}
WARN_VALUES = {"warnung", "pending", "insufficient"}
CRIT_VALUES = {"kritisch", "failed", "error"}
UNKNOWN_VALUES = {"unknown", "missing"}
# A non-zero number under a label containing one of these is critical.
CRIT_LABEL_HINTS = ("unhealthy", "firing")
# Labels of the report's leading bullet list that are treated as metadata.
META_LABELS = {"Erstellt", "Zeitraum", "Host", "Gesamtbewertung"}

SUMMARY_LIST_RE = re.compile(r"^-\s+(.+?):\s*`(.+?)`\s*$")
H1_DATE_RE = re.compile(r"^#\s+Homelab Operations Report\s*-\s*(\d{4}-\d{2}-\d{2})")
INLINE_CODE_RE = re.compile(r"`([^`]+)`")


def classify(label, value):
    """Map a summary entry to a status name.

    The value is compared case-insensitively against the fixed vocabularies
    first (OK, then UNKNOWN, WARNUNG, KRITISCH). Anything else is read as a
    number: zero is "OK", a non-zero number is "KRITISCH" when the label
    contains one of CRIT_LABEL_HINTS and "WARNUNG" otherwise. Text that is
    neither a known word nor a number yields "UNKNOWN".
    """
    normalized = value.strip().lower()
    label_lc = label.strip().lower()

    vocabularies = (
        (OK_VALUES, "OK"),
        (UNKNOWN_VALUES, "UNKNOWN"),
        (WARN_VALUES, "WARNUNG"),
        (CRIT_VALUES, "KRITISCH"),
    )
    for words, verdict in vocabularies:
        if normalized in words:
            return verdict

    try:
        number = float(normalized)
    except ValueError:
        return "UNKNOWN"

    if number == 0:
        return "OK"
    is_critical_metric = any(hint in label_lc for hint in CRIT_LABEL_HINTS)
    return "KRITISCH" if is_critical_metric else "WARNUNG"


def classify_callout(text):
    """Pick the callout flavour ("crit", "warn" or "ok") for an assessment line.

    Matching is by case-insensitive substring; critical markers win over
    warning markers.
    """
    haystack = text.lower()
    for marker in ("kritisch", "sofort"):
        if marker in haystack:
            return "crit"

    warn_hints = ("drift", "warnung", "ablauf", "brauchen aufmerksamkeit", "pruefen", "prüfen", "ueberalter", "nachverfolgt")
    for marker in warn_hints:
        if marker in haystack:
            return "warn"
    return "ok"
blocks.append(("paragraph", joined)) + + while i < n: + line = lines[i] + stripped = line.rstrip() + + m1 = H1_DATE_RE.match(stripped) + if m1: + report_date = m1.group(1) + i += 1 + continue + if stripped.startswith("# "): + i += 1 + continue + + if stripped.startswith("```"): + i += 1 + pre_buf = [] + while i < n and not lines[i].startswith("```"): + pre_buf.append(lines[i]) + i += 1 + if i < n: + i += 1 # closing fence + blocks.append(("pre", "\n".join(pre_buf))) + continue + + if stripped.startswith("### "): + title = stripped[4:].strip() + in_management_section = (title == "Management-Bewertung") + blocks.append(("heading", 3, title)) + i += 1 + continue + if stripped.startswith("## "): + blocks.append(("heading", 2, stripped[3:].strip())) + seen_h2 = True + in_management_section = False + i += 1 + continue + + if stripped.startswith("- "): + if in_management_section: + entries = [] + while i < n and lines[i].rstrip().startswith("- "): + body = lines[i].rstrip()[2:].strip() + if ":" in body: + lbl, val = body.split(":", 1) + val = val.strip() + val = re.sub(r"`([^`]+)`", r"\1", val) + entries.append((lbl.strip(), val)) + else: + entries.append((body, "")) + i += 1 + blocks.append(("summary_grid", entries)) + in_management_section = False + continue + + if not seen_h2: + saved_i = i + tmp_items = [] + tmp_is_meta = True + while i < n and lines[i].rstrip().startswith("- "): + body = lines[i].rstrip()[2:].strip() + if ":" not in body: + tmp_is_meta = False + break + lbl, val = body.split(":", 1) + val = val.strip() + m = re.search(r"`([^`]+)`", val) + if m: + val = m.group(1) + else: + val = val.strip("`") + tmp_items.append((lbl.strip(), val)) + i += 1 + if tmp_is_meta and tmp_items and any(lbl in META_LABELS for lbl, _ in tmp_items): + for lbl, val in tmp_items: + meta[lbl] = val + continue + i = saved_i + + items = [] + while i < n and lines[i].rstrip().startswith("- "): + items.append(lines[i].rstrip()[2:].strip()) + i += 1 + blocks.append(("ul", items)) + 
def inject_section_wrappers(blocks):
    """Replace every level-2 heading with section open/close markers.

    Each ("heading", 2, title) block becomes ("section_open", title); the
    previously opened section, if any, is closed with ("section_close",)
    right before it, and a trailing ("section_close",) ends the last one.
    All other blocks are passed through unchanged and in order.
    """
    wrapped = []
    section_is_open = False

    for block in blocks:
        starts_section = block[0] == "heading" and block[1] == 2
        if not starts_section:
            wrapped.append(block)
            continue
        if section_is_open:
            wrapped.append(("section_close",))
        wrapped.append(("section_open", block[2]))
        section_is_open = True

    if section_is_open:
        wrapped.append(("section_close",))
    return wrapped
erstellt = meta.get("Erstellt", "") + zeitraum = meta.get("Zeitraum", "") + if erstellt: + chips.append(f"Erstellt {html.escape(erstellt)}") + if zeitraum: + chips.append(f"Zeitraum {html.escape(zeitraum)}") + if hostname: + chips.append(f"Host {html.escape(hostname)}") + chips_html = "" + if chips: + chips_html = ( + '
' + + "  ·  ".join(chips) + + "
" + ) + return ( + '
' + '
Homelab Operations Report
' + f'
' + f'{html.escape(date_label)}
' + '
' + '{html.escape(status)}' + '
' + f'{chips_html}' + '
' + ) + + +def render_section_open(title): + return ( + '' + '' + f'' + '
' + f'

{html.escape(title)}

' + ) + + +def render_section_close(): + return "
" + + +def render_heading(level, text): + if level == 3: + return ( + f'

' + f'{inline(text)}

' + ) + return ( + f'

{inline(text)}

' + ) + + +def render_paragraph(text): + return ( + f'

{inline(text)}

' + ) + + +def render_callout(flavor, text): + themes = { + "ok": {"bg": "#ecfdf5", "border": "#16a34a", "text": "#065f46"}, + "warn": {"bg": "#fffbeb", "border": "#d97706", "text": "#78350f"}, + "crit": {"bg": "#fef2f2", "border": "#dc2626", "text": "#7f1d1d"}, + } + t = themes.get(flavor, themes["ok"]) + return ( + f'
{inline(text)}
' + ) + + +def render_ul(items): + lis = "".join( + f'
  • {inline(it)}
  • ' + for it in items + ) + return f'' + + +def render_summary_grid(entries): + if not entries: + return "" + cards = [] + for label, value in entries: + status = classify(label, value) + theme = STATUS_THEMES.get(status, STATUS_THEMES["UNKNOWN"]) + cards.append( + '' + f'
    ' + f'
    {html.escape(label)}
    ' + f'
    ' + f'{html.escape(value)}
    ' + '
    ' + ) + rows_html = [] + for chunk_start in range(0, len(cards), 3): + chunk = cards[chunk_start:chunk_start + 3] + while len(chunk) < 3: + chunk.append('') + rows_html.append("" + "".join(chunk) + "") + return ( + '' + + "".join(rows_html) + + "
    " + ) + + +def render_table(header_cells, alignments, rows): + def is_numeric_header(h): + h_strip = h.strip().rstrip(":") + if re.search(r"(anzahl|zeilen|tage|sekunden|gestern|heute|%|nutzung|frei|resttage)$", + h_strip, re.IGNORECASE): + return True + return False + + final_aligns = [] + for idx, h in enumerate(header_cells): + if idx < len(alignments) and alignments[idx] != "left": + final_aligns.append(alignments[idx]) + elif is_numeric_header(h): + final_aligns.append("right") + else: + final_aligns.append("left") + + th_html = "".join( + f'' + f'{inline(h)}' + for h, a in zip(header_cells, final_aligns) + ) + + tr_html = [] + for idx, row in enumerate(rows): + bg = COLORS["zebra"] if idx % 2 == 1 else COLORS["card_bg"] + cells = [] + for cidx, cell in enumerate(row): + a = final_aligns[cidx] if cidx < len(final_aligns) else "left" + numeric_style = "font-variant-numeric:tabular-nums;" if a == "right" else "" + cells.append( + f'' + f'{inline(cell)}' + ) + tr_html.append( + f'' + "".join(cells) + "" + ) + + return ( + '' + f'{th_html}' + f'{"".join(tr_html)}' + '
    ' + ) + + +def render_pre(text): + return ( + f'
    ' + '
    '
    +        + html.escape(text)
    +        + '
    ' + ) + + +def render_footer(hostname): + ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ") + parts = [] + if hostname: + parts.append(f"Host {html.escape(hostname)}") + parts.append("Generator send-operations-report-mail.sh") + parts.append(f"Rendered {ts}") + return ( + f'
    ' + + "  ·  ".join(parts) + + '
    ' + ) + + +def render_blocks(blocks, status, hostname, report_date, meta): + out = [render_hero(status, report_date, hostname, meta)] + for blk in blocks: + kind = blk[0] + if kind == "section_open": + out.append(render_section_open(blk[1])) + elif kind == "section_close": + out.append(render_section_close()) + elif kind == "heading": + out.append(render_heading(blk[1], blk[2])) + elif kind == "paragraph": + out.append(render_paragraph(blk[1])) + elif kind == "callout": + out.append(render_callout(blk[1], blk[2])) + elif kind == "ul": + out.append(render_ul(blk[1])) + elif kind == "summary_grid": + out.append(render_summary_grid(blk[1])) + elif kind == "table": + out.append(render_table(blk[1], blk[2], blk[3])) + elif kind == "pre": + out.append(render_pre(blk[1])) + out.append(render_footer(hostname)) + return "\n".join(out) + + +def markdown_to_html(text, status="UNKNOWN", hostname=""): + blocks, report_date, meta = parse_blocks(text) + blocks = inject_section_wrappers(blocks) + body_html = render_blocks(blocks, status, hostname, report_date, meta) + css = ( + "body{font-family:-apple-system,BlinkMacSystemFont,'Segoe UI',Helvetica,Arial,sans-serif;" + f"line-height:1.55;color:{COLORS['text']};background:{COLORS['bg']};" + "max-width:940px;margin:24px auto;padding:0 18px}" + "*{box-sizing:border-box}" + f"a{{color:{COLORS['accent']};text-decoration:none}}" + "a:hover{text-decoration:underline}" + ) + return ( + "" + "" + "" + f"" + "" + f"{body_html}" + "" + ) + + +message = EmailMessage() +message["From"] = mail_from +message["To"] = mail_to +message["Subject"] = subject +message.set_content(report, subtype="plain", charset="utf-8") +message.add_alternative( + markdown_to_html(report, status=report_status, hostname=report_hostname), + subtype="html", + charset="utf-8", +) + +context = ssl.create_default_context() +with smtplib.SMTP(smtp_host, smtp_port, timeout=30) as smtp: + smtp.ehlo() + smtp.starttls(context=context) + smtp.ehlo() + smtp.login(smtp_user, 
#!/usr/bin/env bash
# test-log-noise-filter.sh
#
# Verifies that the log-noise filtering pipeline used by collect_log_highlights
# behaves correctly when the pattern file contains comments, empty lines and
# leading / trailing whitespace, and that unknown error lines remain visible
# in attention.
#
# Run from anywhere:
#   bash services/posture-check/tests/test-log-noise-filter.sh
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
NORMALIZE="$SCRIPT_DIR/../lib/normalize-noise-patterns.sh"

if [ ! -x "$NORMALIZE" ]; then
  echo "FAIL: normalize helper not executable at $NORMALIZE" >&2
  exit 1
fi

tmp="$(mktemp -d)"
trap 'rm -rf "$tmp"' EXIT

# Print the failure reason plus whatever intermediate artifacts exist, then
# abort the test run.
fail() {
  echo "FAIL: $*" >&2
  local artifact
  for artifact in patterns.norm known attention; do
    echo "--- $artifact ---" >&2
    if [ -f "$tmp/$artifact" ]; then
      cat "$tmp/$artifact" >&2
    fi
  done
  exit 1
}

# line_count FILE - number of lines in FILE as a bare integer.
line_count() {
  wc -l < "$1" | tr -d ' '
}

# split_hits PATTERNS HITS OUT_DIR
# Splits HITS into OUT_DIR/known (lines matching a normalized pattern) and
# OUT_DIR/attention (everything else). An empty pattern file skips `grep -f`
# entirely, mirroring the branch described for collect_log_highlights.
split_hits() {
  local patterns="$1" hits="$2" out_dir="$3"
  if [ -s "$patterns" ]; then
    grep -Eaif "$patterns" "$hits" > "$out_dir/known" || true
  else
    : > "$out_dir/known"
  fi
  sed -E 's/[[:space:]]+$//' "$out_dir/known" > "$out_dir/known.n"
  sed -E 's/[[:space:]]+$//' "$hits" > "$out_dir/hits.n"
  grep -Fvxf "$out_dir/known.n" "$out_dir/hits.n" > "$out_dir/attention" || true
}

# Pattern file with: comment, empty line, whitespace-only line, indented
# comment and real patterns with leading and trailing whitespace. Written
# with printf so the significant whitespace is explicit and cannot be
# stripped by an editor.
printf '%s\n' \
  '# this is a comment that must be dropped' \
  '' \
  '   ' \
  'monitoring-loki.*context canceled' \
  '   gitea.*user/login/openid.*403 Forbidden' \
  '   # another comment line that must be dropped' \
  'authelia.*Request timeout occurred.*status_code=408   ' \
  > "$tmp/patterns"

# Hits file: 3 known-noise lines (one per real pattern), 2 unknown-error
# lines that must remain in attention.
cat > "$tmp/hits" <<'EOF'
[monitoring-loki] caller=scheduler context canceled
[gitea] router: /user/login/openid 403 Forbidden
[authelia] Request timeout occurred status_code=408
[postgres] FATAL: connection refused for host backup-db
[traefik] error while serving request: tls handshake timeout
EOF

"$NORMALIZE" "$tmp/patterns" > "$tmp/patterns.norm"
split_hits "$tmp/patterns.norm" "$tmp/hits" "$tmp"

# Test 1: normalize produced exactly 3 patterns (comments + empties dropped)
# and trimmed all leading / trailing whitespace.
norm_lines="$(line_count "$tmp/patterns.norm")"
[ "$norm_lines" = "3" ] || fail "T1 expected 3 normalized patterns, got $norm_lines"
if grep -qE '^[[:space:]]|[[:space:]]$' "$tmp/patterns.norm"; then
  fail "T1 normalized output still contains leading or trailing whitespace"
fi

# Test 2: normalize output contains no comment lines.
if grep -q '^#' "$tmp/patterns.norm"; then
  fail "T2 normalized output still contains a comment line"
fi

# Test 3: empty / whitespace-only pattern lines must NOT match all hits.
# With 3 real patterns there must be exactly 3 known-noise lines (out of 5).
known_count="$(line_count "$tmp/known")"
[ "$known_count" = "3" ] || fail "T3 expected 3 known-noise hits, got $known_count"

# Test 4: unknown error lines remain in attention (postgres + traefik).
att_count="$(line_count "$tmp/attention")"
[ "$att_count" = "2" ] || fail "T4 expected 2 attention hits, got $att_count"
grep -q 'postgres.*FATAL' "$tmp/attention" || fail "T4 postgres line missing in attention"
grep -q 'traefik.*tls handshake' "$tmp/attention" || fail "T4 traefik line missing in attention"

# Test 5: regression guard for the worst case - a pattern file containing
# ONLY empty / comment / whitespace lines must produce an empty normalized
# output AND must leave every hit in attention.
printf '%s\n' \
  '# only comments and whitespace below' \
  '' \
  '   ' \
  '# nothing real' \
  '' \
  > "$tmp/patterns.only_empty"
"$NORMALIZE" "$tmp/patterns.only_empty" > "$tmp/patterns.only_empty.norm"
empty_norm_lines="$(line_count "$tmp/patterns.only_empty.norm")"
[ "$empty_norm_lines" = "0" ] \
  || fail "T5 expected 0 normalized patterns from empty-only input, got $empty_norm_lines"

mkdir -p "$tmp/only_empty"
split_hits "$tmp/patterns.only_empty.norm" "$tmp/hits" "$tmp/only_empty"
empty_att_count="$(line_count "$tmp/only_empty/attention")"
[ "$empty_att_count" = "5" ] \
  || fail "T5 expected all 5 hits in attention with empty patterns, got $empty_att_count"

echo "OK - all log-noise filter tests passed (5 tests)"