Files
2026-05-23 11:03:02 +02:00

1202 lines
44 KiB
Bash
Executable File

#!/usr/bin/env bash
# Daily homelab status report - configuration, single-instance lock and
# per-run scratch files. Every setting below can be overridden from the
# environment; an unset or empty variable falls back to the default shown.
set -euo pipefail

SCRIPT_START="$(date +%s)"

# --- Report output locations ---
: "${REPORT_DIR:=/mnt/user/services/posture-check/daily-reports}"
: "${REPORT_DATE:=$(date +%F)}"
: "${REPORT_PATH:=$REPORT_DIR/homelab-day-$REPORT_DATE.md}"
: "${PERSISTENT_SUMMARY_PATH:=$REPORT_DIR/summary-$REPORT_DATE.env}"

# --- Time window, row limits and warning thresholds ---
: "${SINCE:=24h}"
: "${MAX_LOG_LINES:=80}"
: "${CERT_MAX_ROWS:=12}"
: "${IMAGE_AGE_WARN_DAYS:=180}"
: "${LOG_VOLUME_TOP_N:=10}"
: "${DISK_USAGE_WARN_PCT:=85}"
: "${CERT_WARN_DAYS:=21}"
: "${BACKUP_DRIFT_FACTOR:=2.0}"
: "${SHOW_KNOWN_NOISE:=0}"

# --- Notification channels ---
: "${SEND_MAIL:=0}"
: "${MAIL_MODE:=always}"
: "${MAIL_SCRIPT:=/mnt/user/services/homelab-infra/services/posture-check/send-operations-report-mail.sh}"
: "${SEND_NTFY:=0}"
: "${NTFY_TOPIC:=homelab-info}"
: "${NTFY_SCRIPT:=/mnt/user/services/homelab-infra/ops/restore-tests/send-ntfy.sh}"

# --- Data sources ---
: "${BORG_CONTAINER:=borg-ui}"
: "${PROMETHEUS_CONTAINER:=monitoring-prometheus}"
: "${TRAEFIK_ACME_PATH:=/mnt/user/appdata/traefik/letsencrypt/acme.json}"
: "${NOISE_PATTERNS_FILE:=/mnt/user/services/homelab-infra/services/posture-check/log-noise.patterns}"
: "${NORMALIZE_NOISE_SCRIPT:=/mnt/user/services/homelab-infra/services/posture-check/lib/normalize-noise-patterns.sh}"
: "${NOISE_ESCALATION_THRESHOLD:=500}"
: "${NOISE_BREAKDOWN_TOP_N:=10}"
: "${POSTURE_CHECK_FILE:=/mnt/user/services/posture-check/last.json}"
: "${LOCK_FILE:=/tmp/homelab-daily-report.lock}"

REPORT_STATUS="UNKNOWN"

# Single-instance guard: hold a non-blocking exclusive lock on fd 9 for the
# lifetime of the script; a second concurrent run exits with status 3.
exec 9>"$LOCK_FILE"
flock -n 9 || {
  echo "Another daily-status-report run is already in progress (lock: $LOCK_FILE)" >&2
  exit 3
}

# Scratch directory holding the report body, the key=value summary and the
# list of section errors; all three files start out empty.
TMP_DIR="$(mktemp -d /tmp/homelab-daily-report.XXXXXX)"
BODY_PATH="$TMP_DIR/body.md"
SUMMARY_PATH="$TMP_DIR/summary.env"
SECTION_ERRORS_FILE="$TMP_DIR/section-errors.log"
: >"$BODY_PATH"
: >"$SUMMARY_PATH"
: >"$SECTION_ERRORS_FILE"
# Delete the per-run scratch directory. Registered on EXIT so it runs on
# every exit path of the script.
cleanup() { rm -rf "$TMP_DIR"; }
trap 'cleanup' EXIT
# Append one line to the report body. All arguments are joined with a
# single space and written verbatim (never used as a printf format).
append() {
  local line="$*"
  printf '%s\n' "$line" >>"$BODY_PATH"
}
# Append everything read from stdin to the report body, unmodified.
append_block() {
  cat - >>"$BODY_PATH"
}
# Record one key=value pair in the machine-readable summary file.
#
# Arguments: $1 - key (a plain shell identifier)
#            $2 - value
#
# The summary file is later sourced as shell code, so the value must never
# contain shell syntax. Every character outside a conservative safe set
# (letters, digits and _ . : / + , % @ ? = -) is replaced by "_"; this also
# folds newlines, so one call always produces exactly one line. Values made
# only of safe characters are written unchanged.
set_summary() {
  local key="$1" value
  value="$(printf '%s' "$2" | LC_ALL=C tr -c 'A-Za-z0-9_.:/+,%@?=-' '_')"
  printf '%s=%s\n' "$key" "$value" >>"$SUMMARY_PATH"
}
# Note that a report section could not be evaluated properly.
# Arguments: $1 - section name, $2 - human-readable message.
# Writes one "<section>: <message>" line to the section error log.
record_section_error() {
  local section="$1" message="$2"
  printf '%s: %s\n' "$section" "$message" >>"$SECTION_ERRORS_FILE"
}
# Return 0 if a container with the given name or ID exists, non-zero
# otherwise. Produces no output.
#
# Uses `docker container inspect` rather than plain `docker inspect`: the
# untyped form also matches images, volumes and networks, so an image called
# e.g. "traefik" would be mistaken for an existing container.
have_container() {
  docker container inspect "$1" >/dev/null 2>&1
}
# Print the number of newline-terminated lines read from stdin as a plain
# integer without padding.
count_lines() {
  local raw
  raw="$(wc -l)"
  printf '%d\n' "$((raw + 0))"
}
# Filter: collapse every run of whitespace within a line to one space and
# cut each line to its first 260 characters.
shorten() {
  local -r max_chars=260
  sed -E 's/[[:space:]]+/ /g' | cut -c "1-${max_chars}"
}
# Format a duration given in whole seconds as German text using the two
# most significant units, e.g. "1 Tage 1 Stunden" or "5 Minuten 3 Sekunden".
#
# Arguments: $1 - seconds (non-negative integer); missing/empty means 0
# Outputs:   the formatted duration, or "?" if the input is not made of
#            decimal digits only.
format_duration() {
  local s="${1:-0}"
  if [[ ! "$s" =~ ^[0-9]+$ ]]; then
    printf '?\n'
    return
  fi
  # Force base 10: a leading zero would otherwise make bash read the value
  # as octal, and inputs such as "08" or "09" would abort the arithmetic.
  s=$((10#$s))
  local d=$((s / 86400))
  local h=$(((s % 86400) / 3600))
  local m=$(((s % 3600) / 60))
  local sec=$((s % 60))
  if [ "$d" -gt 0 ]; then
    printf '%d Tage %d Stunden\n' "$d" "$h"
  elif [ "$h" -gt 0 ]; then
    printf '%d Stunden %d Minuten\n' "$h" "$m"
  elif [ "$m" -gt 0 ]; then
    printf '%d Minuten %d Sekunden\n' "$m" "$sec"
  else
    printf '%d Sekunden\n' "$sec"
  fi
}
# Render the "Betriebslage" section: container counts (total, running,
# unhealthy, exited with non-zero status) and the state and age of the last
# posture-check result file. The same figures are written to the summary.
collect_overview() {
  local total running unhealthy exited_nonzero line

  total="$(docker ps -a --format '{{.Names}}' | count_lines)"
  running="$(docker ps --format '{{.Names}}' | count_lines)"
  unhealthy="$(docker ps --filter health=unhealthy --format '{{.Names}}' | count_lines)"
  # Exited containers whose status text does not say "Exited (0)".
  exited_nonzero="$(
    docker ps -a --filter status=exited --format '{{.Names}} {{.Status}}' \
      | awk '!/Exited \(0\)/ { count++ } END { print count + 0 }'
  )"

  set_summary "containers_total" "$total"
  set_summary "containers_running" "$running"
  set_summary "containers_unhealthy" "$unhealthy"
  set_summary "containers_exited_nonzero" "$exited_nonzero"

  for line in \
    "## Betriebslage" \
    "" \
    "- Container: $running/$total laufen" \
    "- Unhealthy Container: $unhealthy" \
    "- Exited non-zero Container: $exited_nonzero"; do
    append "$line"
  done

  if [ ! -f "$POSTURE_CHECK_FILE" ]; then
    append "- Letzter Posture-Check: keine Datei gefunden"
    set_summary "posture_status" "missing"
    record_section_error "overview" "Posture-Check-Datei $POSTURE_CHECK_FILE fehlt"
  else
    local posture_status posture_age now_epoch file_mtime
    # First "status": "..." value found in the JSON file.
    posture_status="$(sed -n 's/.*"status": *"\([^"]*\)".*/\1/p' "$POSTURE_CHECK_FILE" | head -n 1)"
    now_epoch="$(date +%s)"
    # If the mtime cannot be read, fall back to "now" (age 0).
    file_mtime="$(stat -c %Y "$POSTURE_CHECK_FILE" 2>/dev/null || echo "$now_epoch")"
    posture_age=$((now_epoch - file_mtime))
    append "- Letzter Posture-Check: ${posture_status:-unbekannt} (Datei ist $(format_duration "$posture_age") alt)"
    set_summary "posture_status" "${posture_status:-unknown}"
    set_summary "posture_age_seconds" "$posture_age"
  fi
  append ""
}
# Render the "Host" section: hostname, boot time, uptime, load averages and
# whether the host booted within the last 24 hours.
#
# Summary keys written:
#   host_recent_boot    - 1 (booted < 24h ago), 0, or "unknown"
#   host_uptime_seconds - uptime in seconds (0 when it cannot be determined)
#   host_load_1         - 1-minute load average, or "?"
#
# If the boot time cannot be read from /proc/stat, a section error is
# recorded and the reboot state is reported as unknown. It is deliberately
# NOT reported as a recent boot: an unreadable btime is a data-source
# problem, not evidence of a reboot.
collect_host_health() {
  append "## Host"
  append ""
  local boot_epoch boot_iso uptime_seconds load_1 load_5 load_15 now_epoch
  local boot_known=1
  now_epoch="$(date +%s)"
  boot_epoch="$(awk '/^btime/ { print $2 }' /proc/stat 2>/dev/null || echo 0)"
  if [[ "${boot_epoch:-0}" =~ ^[0-9]+$ ]] && [ "$boot_epoch" -gt 0 ]; then
    boot_iso="$(date -u -d "@$boot_epoch" -Iseconds 2>/dev/null || echo unknown)"
    uptime_seconds=$((now_epoch - boot_epoch))
  else
    boot_known=0
    boot_iso="unknown"
    uptime_seconds=0
    record_section_error "host" "/proc/stat btime nicht lesbar"
  fi
  # Guard against a boot time that lies in the future.
  if [ "$uptime_seconds" -lt 0 ]; then
    uptime_seconds=0
  fi
  if [ -r /proc/loadavg ]; then
    read -r load_1 load_5 load_15 _ < /proc/loadavg
  else
    load_1="?"
    load_5="?"
    load_15="?"
  fi
  append "- Hostname: \`$(hostname)\`"
  append "- Boot-Zeit: \`$boot_iso\`"
  if [ "$boot_known" -eq 1 ]; then
    append "- Uptime: $(format_duration "$uptime_seconds")"
  else
    append "- Uptime: unbekannt"
  fi
  append "- Load average (1/5/15): $load_1 / $load_5 / $load_15"
  if [ "$boot_known" -eq 0 ]; then
    append "- Reboot in den letzten 24h: unbekannt (Boot-Zeit nicht ermittelbar)"
    set_summary "host_recent_boot" "unknown"
  elif [ "$uptime_seconds" -lt 86400 ]; then
    append "- WARNUNG: Boot innerhalb der letzten 24 Stunden erkannt."
    set_summary "host_recent_boot" "1"
  else
    append "- Reboot in den letzten 24h: nein"
    set_summary "host_recent_boot" "0"
  fi
  set_summary "host_uptime_seconds" "$uptime_seconds"
  set_summary "host_load_1" "$load_1"
  append ""
}
# Derive the overall report status from the collected summary values and
# store it in REPORT_STATUS and in the summary as "report_status".
#
#   KRITISCH - borg status failed/error, any unhealthy container, or a
#              known non-zero number of firing Prometheus alerts
#   WARNUNG  - no critical signal, but at least one warning signal
#   OK       - neither
#
# The summary file is sourced, so its keys become shell variables.
derive_report_status() {
  # shellcheck disable=SC1090
  source "$SUMMARY_PATH"
  REPORT_STATUS="OK"
  local warn=0 crit=0

  # --- Warning signals ---
  if [ "${borg_status:-unknown}" != "completed" ]; then warn=1; fi
  if [ "${prometheus_alerts:-0}" = "unknown" ]; then warn=1; fi
  if [ "${cert_warnings:-0}" != "0" ]; then warn=1; fi
  if [ "${disk_warnings:-0}" != "0" ]; then warn=1; fi
  if [ "${image_warnings:-0}" != "0" ]; then warn=1; fi
  if [ "${containers_exited_nonzero:-0}" != "0" ]; then warn=1; fi
  if [ "${host_recent_boot:-0}" = "1" ]; then warn=1; fi
  if [ "${backup_duration_drift:-0}" = "1" ]; then warn=1; fi
  if [ "${noise_threshold_exceeded:-0}" != "0" ]; then warn=1; fi
  # Pending alerts count only when the number is known and non-zero.
  case "${prometheus_alerts_pending:-0}" in
    0 | unknown) ;;
    *) warn=1 ;;
  esac

  # --- Critical signals ---
  case "${borg_status:-unknown}" in
    failed | error) crit=1 ;;
  esac
  if [ "${containers_unhealthy:-0}" != "0" ]; then crit=1; fi
  # Firing alerts count only when the number is known and non-zero.
  case "${prometheus_alerts_firing:-0}" in
    0 | unknown) ;;
    *) crit=1 ;;
  esac

  if [ "$crit" -eq 1 ]; then
    REPORT_STATUS="KRITISCH"
  elif [ "$warn" -eq 1 ]; then
    REPORT_STATUS="WARNUNG"
  fi
  set_summary "report_status" "$REPORT_STATUS"
}
# Render the "Borg Backup" section from the SQLite database inside the Borg
# UI container and record the backup state in the summary.
#
# Globals read: BORG_CONTAINER, BACKUP_DRIFT_FACTOR, BODY_PATH
# Summary keys written:
#   borg_status           - status of the most recent backup job, "missing"
#                           if there is none, "unknown" on evaluation errors
#   backup_duration_drift - 1 if the latest completed run took more than
#                           BACKUP_DRIFT_FACTOR times the 14-day median,
#                           0 if not, "unknown" otherwise
#
# The drift threshold comes from BACKUP_DRIFT_FACTOR (default 2.0) and is
# handed to both Python snippets through the container environment, so the
# rendered assessment and the summary flag always use the same value. A
# missing, non-numeric or non-positive value falls back to 2.0.
collect_borg() {
  append "## Borg Backup"
  append ""
  if ! have_container "$BORG_CONTAINER"; then
    append "- WARNUNG: Container \`$BORG_CONTAINER\` nicht gefunden."
    append ""
    set_summary "borg_status" "unknown"
    set_summary "backup_duration_drift" "unknown"
    record_section_error "borg" "Container $BORG_CONTAINER nicht gefunden"
    return
  fi
  local drift_factor="${BACKUP_DRIFT_FACTOR:-2.0}"
  # First pass: human-readable markdown, written straight to the report body.
  if ! docker exec -i -e "BACKUP_DRIFT_FACTOR=$drift_factor" "$BORG_CONTAINER" python3 - <<'PY' >> "$BODY_PATH"
import os
import sqlite3


def drift_threshold():
    """Slowdown factor above which a run counts as drifted (default 2.0)."""
    try:
        factor = float(os.environ.get("BACKUP_DRIFT_FACTOR", "2.0"))
    except ValueError:
        return 2.0
    return factor if factor > 0 else 2.0


def fmt_bytes(value):
    if value is None:
        return "-"
    value = float(value)
    units = ["B", "KB", "MB", "GB", "TB"]
    for unit in units:
        if value < 1024 or unit == units[-1]:
            return f"{value:.1f} {unit}" if unit != "B" else f"{int(value)} B"
        value /= 1024


def fmt_sec(s):
    s = int(s)
    h, rem = divmod(s, 3600)
    m, sec = divmod(rem, 60)
    if h > 0:
        return f"{h}h {m}m"
    return f"{m}m {sec}s"


conn = sqlite3.connect("/data/borg.db")
conn.row_factory = sqlite3.Row
cur = conn.cursor()
print("### Letzte Backup-Jobs")
rows = cur.execute("""
select id, status, started_at, completed_at, archive_name, nfiles,
original_size, compressed_size, deduplicated_size, error_message
from backup_jobs
where started_at >= datetime('now', '-30 hours')
or created_at >= datetime('now', '-30 hours')
order by coalesce(started_at, created_at) desc
limit 8
""").fetchall()
if not rows:
    print("- WARNUNG: Kein Backup-Job in den letzten 30 Stunden gefunden.")
else:
    print("| Zeit UTC | Status | Archiv | Dateien | Original | Dedupliziert |")
    print("|---|---:|---|---:|---:|---:|")
    for row in rows:
        archive = row["archive_name"] or "-"
        if len(archive) > 54:
            archive = archive[:51] + "..."
        print(
            f"| {row['started_at'] or row['completed_at'] or '-'} "
            f"| {row['status']} "
            f"| {archive} "
            f"| {row['nfiles'] if row['nfiles'] is not None else '-'} "
            f"| {fmt_bytes(row['original_size'])} "
            f"| {fmt_bytes(row['deduplicated_size'])} |"
        )
        if row["error_message"]:
            print(f" - Fehler: {row['error_message'][:240]}")
print("")
print("### Zeitplan")
for row in cur.execute("""
select name, enabled, last_run, next_run, cron_expression
from scheduled_jobs
order by id
"""):
    enabled = "aktiv" if row["enabled"] else "pausiert"
    print(f"- {row['name']}: {enabled}, last={row['last_run'] or '-'}, next={row['next_run'] or '-'}, cron=`{row['cron_expression']}`")
print("")
print("### Dauer-Drift (Median 14 Tage)")
duration_rows = cur.execute("""
select started_at, completed_at,
(julianday(completed_at) - julianday(started_at)) * 86400 as duration_seconds
from backup_jobs
where status = 'completed'
and started_at is not null
and completed_at is not null
and completed_at >= datetime('now', '-14 days')
order by completed_at desc
""").fetchall()
durations = [r["duration_seconds"] for r in duration_rows if r["duration_seconds"] and r["duration_seconds"] > 0]
if len(durations) < 3:
    print(f"- Zu wenig Datenpunkte fuer eine Drift-Bewertung (n={len(durations)}).")
else:
    durations_sorted = sorted(durations)
    median = durations_sorted[len(durations_sorted) // 2]
    # Rows are ordered newest first, so index 0 is the latest completed run.
    latest = durations[0]
    ratio = latest / median if median > 0 else 0
    print(f"- Letzter Lauf: {fmt_sec(latest)}")
    print(f"- Median 14 Tage: {fmt_sec(median)} (n={len(durations)})")
    print(f"- Verhaeltnis: {ratio:.2f}x")
    if ratio > drift_threshold():
        print(f"- Bewertung: Drift erkannt - letzter Lauf {ratio:.1f}x langsamer als der Median. Quellgroesse, IO und Repo-Zustand pruefen.")
    else:
        print("- Bewertung: Backup-Dauer im erwarteten Bereich.")
PY
  then
    append "- WARNUNG: Borg-Auswertung fehlgeschlagen."
    set_summary "borg_status" "unknown"
    set_summary "backup_duration_drift" "unknown"
    record_section_error "borg" "Python-Auswertung in $BORG_CONTAINER fehlgeschlagen"
  else
    # Second pass: machine-readable status and drift flag for the summary.
    # Best effort: on failure the output is empty and both keys end up
    # as "unknown".
    local borg_out borg_status borg_drift
    borg_out="$(docker exec -i -e "BACKUP_DRIFT_FACTOR=$drift_factor" "$BORG_CONTAINER" python3 - <<'PY' 2>/dev/null || true
import os
import sqlite3


def drift_threshold():
    """Slowdown factor above which a run counts as drifted (default 2.0)."""
    try:
        factor = float(os.environ.get("BACKUP_DRIFT_FACTOR", "2.0"))
    except ValueError:
        return 2.0
    return factor if factor > 0 else 2.0


conn = sqlite3.connect("/data/borg.db")
conn.row_factory = sqlite3.Row
cur = conn.cursor()
status_row = cur.execute("""
select status
from backup_jobs
order by coalesce(started_at, created_at) desc
limit 1
""").fetchone()
status = status_row[0] if status_row else "missing"
duration_rows = cur.execute("""
select (julianday(completed_at) - julianday(started_at)) * 86400 as ds
from backup_jobs
where status = 'completed'
and started_at is not null
and completed_at is not null
and completed_at >= datetime('now', '-14 days')
order by completed_at desc
""").fetchall()
durations = [r[0] for r in duration_rows if r[0] and r[0] > 0]
if len(durations) < 3:
    drift = "insufficient"
else:
    median = sorted(durations)[len(durations)//2]
    latest = durations[0]
    ratio = latest / median if median > 0 else 0
    drift = "1" if ratio > drift_threshold() else "0"
print(f"status={status}")
print(f"drift={drift}")
PY
)"
    borg_status="$(printf '%s' "$borg_out" | sed -n 's/^status=//p' | head -n 1)"
    borg_drift="$(printf '%s' "$borg_out" | sed -n 's/^drift=//p' | head -n 1)"
    if [ "${borg_drift:-}" = "1" ]; then
      set_summary "backup_duration_drift" "1"
    elif [ "${borg_drift:-}" = "0" ]; then
      set_summary "backup_duration_drift" "0"
    else
      set_summary "backup_duration_drift" "unknown"
    fi
    set_summary "borg_status" "${borg_status:-unknown}"
  fi
  append ""
}
# Render the "Prometheus Alerts" section from the alerts API queried inside
# the Prometheus container and record the alert counts in the summary.
#
# Globals read: PROMETHEUS_CONTAINER, BODY_PATH
# Summary keys written: prometheus_alerts, prometheus_alerts_firing,
#   prometheus_alerts_pending - each a count, or "unknown" if the container
#   or the API is unavailable.
#
# Counting is done with grep -o on the raw JSON. grep exits non-zero when
# nothing matches, which under `set -euo pipefail` would abort the whole
# report for the ordinary case "alerts are firing but none is pending"; each
# grep is therefore guarded so that zero matches simply counts as 0.
collect_prometheus() {
  append "## Prometheus Alerts"
  append ""
  if ! have_container "$PROMETHEUS_CONTAINER"; then
    append "- WARNUNG: Container \`$PROMETHEUS_CONTAINER\` nicht gefunden."
    append ""
    set_summary "prometheus_alerts" "unknown"
    set_summary "prometheus_alerts_firing" "unknown"
    set_summary "prometheus_alerts_pending" "unknown"
    record_section_error "prometheus" "Container $PROMETHEUS_CONTAINER nicht gefunden"
    return
  fi
  local alerts
  alerts="$(docker exec "$PROMETHEUS_CONTAINER" wget -qO- http://localhost:9090/api/v1/alerts 2>/dev/null || true)"
  if [ -z "$alerts" ]; then
    append "- WARNUNG: Prometheus Alerts API nicht erreichbar."
    set_summary "prometheus_alerts" "unknown"
    set_summary "prometheus_alerts_firing" "unknown"
    set_summary "prometheus_alerts_pending" "unknown"
    record_section_error "prometheus" "Alerts-API leer oder nicht erreichbar"
  elif grep -q '"alerts":\[\]' <<<"$alerts"; then
    append "- Keine aktiven Alerts."
    set_summary "prometheus_alerts" "0"
    set_summary "prometheus_alerts_firing" "0"
    set_summary "prometheus_alerts_pending" "0"
  else
    local total firing pending
    total="$({ grep -o '"alertname":"[^"]*"' <<<"$alerts" || true; } | count_lines)"
    firing="$({ grep -o '"state":"firing"' <<<"$alerts" || true; } | count_lines)"
    pending="$({ grep -o '"state":"pending"' <<<"$alerts" || true; } | count_lines)"
    append "- Aktive Alerts insgesamt: $total"
    append "- Davon firing: $firing"
    append "- Davon pending: $pending"
    append ""
    append "### Details"
    # One list entry per extracted label/state token, in document order.
    { grep -o '"alertname":"[^"]*"\|"severity":"[^"]*"\|"instance":"[^"]*"\|"service":"[^"]*"\|"state":"[^"]*"' <<<"$alerts" || true; } \
      | sed 's/^/ - /' >> "$BODY_PATH"
    set_summary "prometheus_alerts" "$total"
    set_summary "prometheus_alerts_firing" "$firing"
    set_summary "prometheus_alerts_pending" "$pending"
  fi
  append ""
}
# Render the "Zertifikate" section from Traefik's ACME storage file and
# record the number of certificate warnings in the summary.
#
# Globals read: TRAEFIK_ACME_PATH, CERT_MAX_ROWS, CERT_WARN_DAYS, TMP_DIR
# Summary key written: cert_warnings - number of certificates with fewer
#   than CERT_WARN_DAYS days left, or 1 if the file is missing, unreadable
#   or contains no certificates.
#
# The ACME file is parsed in a throw-away python:3.13-alpine container that
# prints one "<days left>\t<expiry date>\t<domains>" line per certificate.
# Only the CERT_MAX_ROWS certificates closest to expiry are listed, but
# warnings are counted across all of them.
collect_certificate_health() {
  append "## Zertifikate"
  append ""
  local cert_file="$TMP_DIR/certificates.tsv"
  local cert_sorted="$TMP_DIR/certificates.sorted.tsv"
  local warning_count=0
  local total_count=0
  : > "$cert_file"
  if [ ! -f "$TRAEFIK_ACME_PATH" ]; then
    append "- WARNUNG: Traefik ACME-Datei nicht gefunden: $TRAEFIK_ACME_PATH"
    set_summary "cert_warnings" "1"
    record_section_error "certificates" "ACME-Datei $TRAEFIK_ACME_PATH fehlt"
    append ""
    return
  fi
  if docker run -i --rm \
    -v "$TRAEFIK_ACME_PATH:/acme.json:ro" \
    python:3.13-alpine python - <<'PY' > "$cert_file"
import base64
import json
import os
import ssl
import tempfile
from datetime import datetime, timezone

with open("/acme.json", "r", encoding="utf-8") as handle:
    data = json.load(handle)
now = datetime.now(timezone.utc)
for resolver in data.values():
    if not isinstance(resolver, dict):
        continue
    # A resolver without certificates is stored as "Certificates": null.
    # dict.get() only applies its default when the key is absent, so the
    # null must be mapped to an empty list explicitly.
    for cert in resolver.get("Certificates") or []:
        domain_info = cert.get("domain") or {}
        domain = domain_info.get("main") or "-"
        sans = domain_info.get("sans") or []
        cert_b64 = cert.get("certificate")
        if not cert_b64:
            continue
        pem = base64.b64decode(cert_b64)
        # The decoder used below only accepts a file path.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            tmp.write(pem)
            tmp_path = tmp.name
        try:
            decoded = ssl._ssl._test_decode_cert(tmp_path)
        finally:
            os.unlink(tmp_path)
        not_after = datetime.strptime(decoded["notAfter"], "%b %d %H:%M:%S %Y %Z").replace(tzinfo=timezone.utc)
        days = (not_after - now).days
        names = ", ".join([domain, *sans])
        print(f"{days}\t{not_after.date().isoformat()}\t{names}")
PY
  then
    if [ ! -s "$cert_file" ]; then
      append "- WARNUNG: Keine Zertifikate in ACME-Datei gefunden."
      warning_count=1
      record_section_error "certificates" "ACME-Datei enthielt keine Zertifikate"
    else
      sort -n "$cert_file" > "$cert_sorted"
      total_count="$(count_lines < "$cert_sorted")"
      append "- Zertifikate gesamt: $total_count"
      append "- Anzeige: die $CERT_MAX_ROWS Zertifikate mit der kuerzesten Restlaufzeit"
      append "- Schwelle Warnung: weniger als $CERT_WARN_DAYS Tage"
      append ""
      append "| Resttage | Ablaufdatum UTC | Domains |"
      append "|---:|---|---|"
      # Single pass: list the first CERT_MAX_ROWS rows, count warnings on all.
      local days expires domains row=0
      while IFS=$'\t' read -r days expires domains; do
        row=$((row + 1))
        if [ "$row" -le "$CERT_MAX_ROWS" ]; then
          append "| $days | $expires | $domains |"
        fi
        if [ "${days:-0}" -lt "$CERT_WARN_DAYS" ]; then
          warning_count=$((warning_count + 1))
        fi
      done < "$cert_sorted"
      append ""
      if [ "$warning_count" -eq 0 ]; then
        append "Bewertung: Keine Zertifikate im kritischen Erneuerungsfenster unter $CERT_WARN_DAYS Tagen."
      else
        append "Bewertung: $warning_count Zertifikat(e) laufen in weniger als $CERT_WARN_DAYS Tagen ab und sollten beobachtet werden."
      fi
    fi
  else
    append "- WARNUNG: Zertifikate konnten nicht aus ACME-Datei gelesen werden."
    warning_count=1
    record_section_error "certificates" "Auswertung der ACME-Datei fehlgeschlagen"
  fi
  set_summary "cert_warnings" "$warning_count"
  append ""
}
# Render the "Storage / Filesystem" section for a fixed list of core paths
# and record the number of storage warnings in the summary.
#
# Globals read: DISK_USAGE_WARN_PCT
# Summary key written: disk_warnings - number of paths that are missing,
#   whose usage cannot be determined, or whose usage is at or above
#   DISK_USAGE_WARN_PCT percent.
#
# A failing df/findmnt (for example on a stale mount) must not abort the
# report under `set -euo pipefail`; such a path is listed as "unbekannt".
collect_disk_health() {
  append "## Storage / Filesystem"
  append ""
  local disk_warnings=0
  local -a paths=(/mnt/cache /mnt/disk1 /mnt/user /mnt/user/appdata /mnt/user/backups)
  local path fstype usage usage_cell avail verdict
  append "- Schwelle Warnung: Nutzung ab ${DISK_USAGE_WARN_PCT}%"
  append ""
  append "| Pfad | Filesystem | Nutzung | Frei | Bewertung |"
  append "|---|---|---:|---:|---|"
  for path in "${paths[@]}"; do
    if [ ! -e "$path" ]; then
      append "| $path | - | - | - | fehlt |"
      disk_warnings=$((disk_warnings + 1))
      record_section_error "disk" "Kernpfad $path fehlt"
      continue
    fi
    fstype="$(findmnt -T "$path" -no FSTYPE 2>/dev/null | head -n 1 || true)"
    # Column 5 of POSIX df output is the usage percentage.
    usage="$(df -P "$path" 2>/dev/null | awk 'NR==2 { gsub("%", "", $5); print $5 }' || true)"
    avail="$(df -hP "$path" 2>/dev/null | awk 'NR==2 { print $4 }' || true)"
    verdict="ok"
    if [[ ! "${usage:-}" =~ ^[0-9]+$ ]]; then
      usage_cell="-"
      verdict="unbekannt"
      disk_warnings=$((disk_warnings + 1))
    else
      usage_cell="${usage}%"
      if [ "$usage" -ge "$DISK_USAGE_WARN_PCT" ]; then
        verdict="Warnung: >=${DISK_USAGE_WARN_PCT}%"
        disk_warnings=$((disk_warnings + 1))
      fi
    fi
    append "| $path | ${fstype:-unbekannt} | $usage_cell | ${avail:-?} | $verdict |"
  done
  append ""
  if [ "$disk_warnings" -eq 0 ]; then
    append "Bewertung: Keine kritischen Fuellstaende oder fehlenden Kernpfade erkannt."
  else
    append "Bewertung: $disk_warnings Storage-/Filesystem-Punkt(e) brauchen Aufmerksamkeit."
  fi
  set_summary "disk_warnings" "$disk_warnings"
  append ""
}
# Render the "Image-Aktualitaet" section: for every running container the
# age in days of the image it runs, the ten oldest listed in a table.
# Writes image_warnings (containers whose image is at least
# IMAGE_AGE_WARN_DAYS days old) to the summary.
collect_image_freshness() {
  local image_file="$TMP_DIR/images.tsv"
  local image_warnings=0
  local now_epoch
  local container image_id created_iso created_epoch age_days image_tag

  append "## Image-Aktualitaet"
  append ""
  : > "$image_file"
  now_epoch="$(date +%s)"

  # Containers whose image id or creation date cannot be resolved are skipped.
  while IFS= read -r container; do
    if [ -z "$container" ]; then
      continue
    fi
    image_id="$(docker inspect --format '{{.Image}}' "$container" 2>/dev/null || true)"
    if [ -z "$image_id" ]; then
      continue
    fi
    created_iso="$(docker image inspect --format '{{.Created}}' "$image_id" 2>/dev/null || true)"
    image_tag="$(docker inspect --format '{{.Config.Image}}' "$container" 2>/dev/null || echo '?')"
    if [ -z "$created_iso" ]; then
      continue
    fi
    created_epoch="$(date -d "$created_iso" +%s 2>/dev/null || echo 0)"
    if [ "$created_epoch" -le 0 ]; then
      continue
    fi
    age_days=$(((now_epoch - created_epoch) / 86400))
    printf '%d\t%s\t%s\n' "$age_days" "$container" "$image_tag" >> "$image_file"
    if [ "$age_days" -ge "$IMAGE_AGE_WARN_DAYS" ]; then
      image_warnings=$((image_warnings + 1))
    fi
  done < <(docker ps --format '{{.Names}}')

  set_summary "image_warnings" "$image_warnings"

  if [ -s "$image_file" ]; then
    append "- Schwelle Warnung: Image aelter als $IMAGE_AGE_WARN_DAYS Tage"
    append "- Container mit Image >= $IMAGE_AGE_WARN_DAYS Tage: $image_warnings"
    append ""
    append "### Aelteste Images (Top 10)"
    append ""
    append "| Alter Tage | Container | Image |"
    append "|---:|---|---|"
    sort -nr "$image_file" | head -n 10 | while IFS="$(printf '\t')" read -r row_age row_name row_image; do
      append "| $row_age | $row_name | $row_image |"
    done
    append ""
    if [ "$image_warnings" -eq 0 ]; then
      append "Bewertung: Keine Container mit ueberalterten Images. CVE-Hygiene aus dieser Sicht ok."
    else
      append "Bewertung: $image_warnings Container nutzen Images aelter als $IMAGE_AGE_WARN_DAYS Tage. Update-Pipeline und CVE-Status pruefen."
    fi
  else
    append "- Keine Image-Daten verfuegbar."
    record_section_error "images" "Keine Image-Daten ermittelt"
  fi
  append ""
}
# Render the "Docker Events" section: die/oom/kill/restart events within the
# SINCE window, leaving out containers that exited cleanly (die with exit
# code 0). Writes docker_events (number of relevant events) to the summary.
#
# Globals read: SINCE, MAX_LOG_LINES, TMP_DIR, BODY_PATH
#
# At most MAX_LOG_LINES of the most recent events are printed, the same
# limit the other log excerpts in this report use (default 80).
collect_container_events() {
  append "## Docker Events ($SINCE)"
  append ""
  local events_file="$TMP_DIR/docker-events.log"
  # Best effort: `docker events` is bounded by --until and by a 20 s
  # timeout; a failure or timeout leaves whatever was collected so far.
  timeout 20 docker events \
    --since "$SINCE" \
    --until "$(date -Iseconds)" \
    --filter event=die \
    --filter event=oom \
    --filter event=kill \
    --filter event=restart \
    --format '{{.Time}}|{{.Actor.Attributes.name}}|{{.Action}}|{{.Actor.Attributes.exitCode}}|{{.Actor.Attributes.image}}' \
    | awk -F '|' '!(($3 == "die") && ($4 == "0")) { print }' \
      > "$events_file" 2>/dev/null || true
  local event_count
  event_count="$(count_lines < "$events_file")"
  set_summary "docker_events" "$event_count"
  if [ "$event_count" -eq 0 ]; then
    append '- Keine `die`/`oom`/`kill`/`restart` Events im Zeitraum.'
  else
    append "- Relevante Events: $event_count"
    append ""
    append '```text'
    tail -n "${MAX_LOG_LINES:-80}" "$events_file" >> "$BODY_PATH"
    append '```'
  fi
  append ""
}
# Render the "Container-Zustand" section: containers that are not running
# (exited, dead or created) and containers with a restart count above zero,
# the latter sorted by restart count in descending order.
collect_container_state() {
  local stopped_file="$TMP_DIR/stopped.log"
  local restart_file="$TMP_DIR/restarts.log"
  local container restarts

  append "## Container-Zustand"
  append ""

  append "### Nicht laufende Container"
  docker ps -a --filter status=exited --filter status=dead --filter status=created --format '{{.Names}}\t{{.Status}}' > "$stopped_file"
  if [ -s "$stopped_file" ]; then
    append '```text'
    cat "$stopped_file" >> "$BODY_PATH"
    append '```'
  else
    append "- Keine."
  fi
  append ""

  append "### Container mit RestartCount > 0"
  : > "$restart_file"
  while IFS= read -r container; do
    if [ -z "$container" ]; then
      continue
    fi
    # A container that cannot be inspected counts as never restarted.
    restarts="$(docker inspect "$container" --format '{{.RestartCount}}' 2>/dev/null || echo 0)"
    if [ "${restarts:-0}" -gt 0 ]; then
      printf '%s\t%s\n' "$container" "$restarts" >> "$restart_file"
    fi
  done < <(docker ps -a --format '{{.Names}}')
  if [ -s "$restart_file" ]; then
    append '```text'
    sort -k2,2nr "$restart_file" >> "$BODY_PATH"
    append '```'
  else
    append "- Keine."
  fi
  append ""
}
# Render the "Traefik 5xx" section from the Traefik container's access log
# within the SINCE window and write traefik_5xx (number of 5xx responses)
# to the summary.
#
# Globals read: TRAEFIK_CONTAINER (optional, default "traefik"), SINCE,
#   MAX_LOG_LINES, TMP_DIR, BODY_PATH
#
# Field positions assume Traefik's common access log format:
#   <ip> - <user> [<date> <tz>] "<method> <path> <proto>" <status> <size>
#   "<referrer>" "<user agent>" <count> "<router>" "<server url>" <dur>ms
# The status code is field 9. The user agent may contain spaces, so the
# router name has no fixed position counted from the left; it is always the
# third field from the right and is read from there. Lines too short for
# that layout fall back to field 12.
collect_traefik_5xx() {
  local container="${TRAEFIK_CONTAINER:-traefik}"
  append "## Traefik 5xx ($SINCE)"
  append ""
  if ! have_container "$container"; then
    append "- Traefik-Container nicht gefunden."
    append ""
    record_section_error "traefik" "Container $container nicht gefunden"
    return
  fi
  local file="$TMP_DIR/traefik-5xx.log"
  # Best effort: a failing `docker logs` leaves an empty or partial file.
  docker logs --since "$SINCE" "$container" 2>&1 \
    | awk '$9 ~ /^5[0-9][0-9]$/ { print }' \
      > "$file" || true
  local count
  count="$(count_lines < "$file")"
  set_summary "traefik_5xx" "$count"
  if [ "$count" -eq 0 ]; then
    append "- Keine 5xx-Antworten."
  else
    append "- 5xx-Antworten: $count"
    append ""
    append "### Gruppiert nach Service/Code"
    append '```text'
    awk '{ code = $9; service = (NF >= 14) ? $(NF - 2) : $12; gsub(/"/, "", service); counts[service " " code]++ } END { for (k in counts) print counts[k], k }' "$file" | sort -nr >> "$BODY_PATH"
    append '```'
    append ""
    append "### Letzte Zeilen"
    append '```text'
    tail -n "$MAX_LOG_LINES" "$file" >> "$BODY_PATH"
    append '```'
  fi
  append ""
}
# Render the "Log-Auswertung" section: scan the logs of all running
# containers within the SINCE window for error-like lines, split the hits
# into "known noise" (lines matching a pattern from NOISE_PATTERNS_FILE) and
# "attention" (all other hits), and report counts, per-container and
# per-pattern breakdowns and example lines.
#
# Globals read: SINCE, TMP_DIR, BODY_PATH, NOISE_PATTERNS_FILE,
#   NORMALIZE_NOISE_SCRIPT, NOISE_ESCALATION_THRESHOLD,
#   NOISE_BREAKDOWN_TOP_N, MAX_LOG_LINES, SHOW_KNOWN_NOISE
# Summary keys written: noise_threshold_exceeded, log_highlights,
#   log_hits_total, log_known_noise
collect_log_highlights() {
append "## Log-Auswertung ($SINCE)"
append ""
append "Ziel dieses Abschnitts ist nicht, Rohlogs zu wiederholen, sondern handlungsrelevante Auffaelligkeiten auszusortieren."
append ""
local hits="$TMP_DIR/log-hits.log"
local attention="$TMP_DIR/log-attention.log"
local known_noise="$TMP_DIR/log-known-noise.log"
: > "$hits"
: > "$attention"
: > "$known_noise"
# Collect candidate lines per container: keep lines with error-like
# keywords, drop lines matching the info/health-check exclusions, redact
# token-like values, and prefix every line with "[<container>] ".
while IFS= read -r name; do
[ -n "$name" ] || continue
docker logs --since "$SINCE" "$name" 2>&1 \
| grep -Eai 'error|fatal|panic|exception|failed|denied|unauthorized|forbidden|oom' \
| grep -Eavi 'level=info|levelname.: .INFO| 200 OK| 404 Not Found|healthcheck|probe_success' \
| grep -Eavi 'production.DEBUG|stats_refresh_scheduler.*errors.: 0|Sync completed.*Failed: 0' \
| sed -E 's/(refresh_token: )[A-Za-z0-9._-]+/\1[REDACTED]/Ig; s/(token: )[A-Za-z0-9._-]+/\1[REDACTED]/Ig; s/(Authorization: )[A-Za-z0-9._ -]+/\1[REDACTED]/Ig' \
| sed "s/^/[$name] /" >> "$hits" || true
done < <(docker ps --format '{{.Names}}')
# Normalize the noise pattern file (drop comments, empty lines, trim
# whitespace). An empty or whitespace-only pattern line would otherwise
# make grep -Eaif match every hit and silently wipe the log highlights.
local noise_normalized="$TMP_DIR/noise.patterns.normalized"
: > "$noise_normalized"
if [ -f "$NOISE_PATTERNS_FILE" ]; then
if [ -x "$NORMALIZE_NOISE_SCRIPT" ]; then
"$NORMALIZE_NOISE_SCRIPT" "$NOISE_PATTERNS_FILE" > "$noise_normalized" 2>/dev/null || : > "$noise_normalized"
else
record_section_error "log-highlights" "Normalize-Helper fehlt oder nicht ausfuehrbar: $NORMALIZE_NOISE_SCRIPT - Noise-Patterns ungenormt verwendet"
# Fallback inline (same logic as the helper) so we still avoid the
# "empty line matches all" trap.
grep -Ev '^[[:space:]]*(#|$)' "$NOISE_PATTERNS_FILE" 2>/dev/null \
| sed -E 's/^[[:space:]]+//; s/[[:space:]]+$//' \
| grep -v '^$' > "$noise_normalized" || : > "$noise_normalized"
fi
else
record_section_error "log-highlights" "Noise-Pattern-Datei $NOISE_PATTERNS_FILE fehlt - alle Treffer gelten als handlungsrelevant"
fi
# Split the hits: lines matching any noise pattern go to $known_noise,
# every other hit goes to $attention.
if [ -s "$hits" ]; then
if [ -s "$noise_normalized" ]; then
grep -Eaif "$noise_normalized" "$hits" > "$known_noise" || true
fi
if [ -s "$known_noise" ]; then
# Strip trailing whitespace on both sides so the whole-line comparison
# below is not defeated by differing line endings.
sed -E 's/[[:space:]]+$//' "$known_noise" > "$known_noise.norm"
sed -E 's/[[:space:]]+$//' "$hits" > "$hits.norm"
grep -Fvxf "$known_noise.norm" "$hits.norm" > "$attention" || true
else
cp "$hits" "$attention"
fi
fi
# Per-container noise breakdown (always computed, even if SHOW_KNOWN_NOISE=0).
local noise_by_container="$TMP_DIR/noise-by-container.tsv"
: > "$noise_by_container"
if [ -s "$known_noise" ]; then
awk -F '[][]' '{ counts[$2]++ } END { for (n in counts) print counts[n] "\t" n }' "$known_noise" \
| sort -nr > "$noise_by_container"
fi
# Per-pattern noise breakdown: count how often each pattern hit in $hits.
# Note: a single hit line may match multiple patterns; counts can overlap.
local noise_by_pattern="$TMP_DIR/noise-by-pattern.tsv"
: > "$noise_by_pattern"
if [ -s "$noise_normalized" ] && [ -s "$hits" ]; then
while IFS= read -r p; do
[ -n "$p" ] || continue
local pcount
pcount="$(grep -Eaic -- "$p" "$hits" 2>/dev/null || true)"
if [ "${pcount:-0}" -gt 0 ]; then
printf '%d\t%s\n' "$pcount" "$p" >> "$noise_by_pattern"
fi
done < "$noise_normalized"
if [ -s "$noise_by_pattern" ]; then
sort -nr -o "$noise_by_pattern" "$noise_by_pattern"
fi
fi
# Threshold escalation: how many patterns produced more than the threshold?
local noise_threshold_exceeded=0
if [ -s "$noise_by_pattern" ]; then
noise_threshold_exceeded="$(awk -v t="$NOISE_ESCALATION_THRESHOLD" '$1 > t { n++ } END { print n + 0 }' "$noise_by_pattern")"
fi
set_summary "noise_threshold_exceeded" "$noise_threshold_exceeded"
local hit_count attention_count known_noise_count
hit_count="$(count_lines < "$hits")"
attention_count="$(count_lines < "$attention")"
known_noise_count="$(count_lines < "$known_noise")"
set_summary "log_highlights" "$attention_count"
set_summary "log_hits_total" "$hit_count"
set_summary "log_known_noise" "$known_noise_count"
# Render the section: totals first, then attention details, then the
# noise breakdowns.
if [ "$hit_count" -eq 0 ]; then
append "- Keine auffaelligen Logmuster gefunden."
else
append "- Gefundene Logmuster insgesamt: $hit_count"
append "- Davon als bekanntes Rauschen eingeordnet: $known_noise_count"
append "- Handlungsrelevante Logmuster: $attention_count"
append "- Noise-Pattern-Quelle: \`$NOISE_PATTERNS_FILE\`"
append "- Eskalations-Schwelle pro Pattern: $NOISE_ESCALATION_THRESHOLD"
if [ "$noise_threshold_exceeded" -gt 0 ]; then
append "- WARNUNG: $noise_threshold_exceeded Pattern ueberschreit(en) die Schwelle - bitte pruefen ob noch wirklich Noise."
fi
append ""
if [ "$attention_count" -eq 0 ]; then
append "Bewertung: Keine handlungsrelevanten Logmuster. Die Treffer bestehen aus bekannten, aktuell nicht kritischen Meldungen."
else
append "Bewertung: Es gibt Logmuster, die nicht automatisch als bekanntes Rauschen eingeordnet wurden. Diese sollten geprueft werden."
append ""
append "### Betroffene Container"
append ""
append "| Container | Anzahl |"
append "|---|---:|"
awk -F '[][]' '{ counts[$2]++ } END { for (name in counts) print "| " name " | " counts[name] " |" }' "$attention" | sort >> "$BODY_PATH"
append ""
append "### Beispiele"
append ""
append '```text'
# Up to three example lines per container, whitespace collapsed and cut to
# 220 characters; the overall output is capped at MAX_LOG_LINES lines.
awk -F '[][]' '
{
name=$2
if (seen[name] < 3) {
line=$0
gsub(/[[:space:]]+/, " ", line)
if (length(line) > 220) line=substr(line, 1, 217) "..."
print line
seen[name]++
}
}
' "$attention" | head -n "$MAX_LOG_LINES" >> "$BODY_PATH"
append '```'
fi
if [ "$known_noise_count" -gt 0 ]; then
append ""
append "### Bekanntes Rauschen (Top)"
append ""
if [ -s "$noise_by_container" ]; then
append "#### Container mit den meisten Noise-Treffern"
append ""
append "| Container | Anzahl |"
append "|---|---:|"
head -n "$NOISE_BREAKDOWN_TOP_N" "$noise_by_container" \
| while IFS="$(printf '\t')" read -r cnt cname; do
append "| ${cname:-?} | $cnt |"
done
append ""
fi
if [ -s "$noise_by_pattern" ]; then
append "#### Pattern mit den meisten Treffern"
append ""
append "| Pattern | Anzahl |"
append "|---|---:|"
head -n "$NOISE_BREAKDOWN_TOP_N" "$noise_by_pattern" \
| while IFS="$(printf '\t')" read -r cnt pat; do
local short="$pat"
if [ "${#short}" -gt 80 ]; then
short="${short:0:77}..."
fi
# Escape pipe characters that would break the markdown table.
short="${short//|/\\|}"
append "| \`$short\` | $cnt |"
done
append ""
fi
if [ "$noise_threshold_exceeded" -gt 0 ]; then
append "Bewertung: $noise_threshold_exceeded Pattern ueberschreit(en) die Eskalations-Schwelle ($NOISE_ESCALATION_THRESHOLD). Bitte pruefen, ob die als Noise eingeordneten Meldungen noch fachlich Noise sind oder ob sich ein echter Vorfall darunter versteckt."
else
append "Bewertung: Kein Pattern ueberschreitet die Eskalations-Schwelle ($NOISE_ESCALATION_THRESHOLD)."
fi
fi
# Optional raw dump of the suppressed lines, only when SHOW_KNOWN_NOISE=1.
if [ "$known_noise_count" -gt 0 ] && [ "$SHOW_KNOWN_NOISE" = "1" ]; then
append ""
append "### Ausgeblendetes bekanntes Rauschen (Top 50 Zeilen)"
append ""
append '```text'
head -n 50 "$known_noise" >> "$BODY_PATH"
append '```'
fi
fi
append ""
}
# Render the "Log-Volumen" section: number of log lines per running
# container within the SINCE window, listing the LOG_VOLUME_TOP_N loudest
# containers. Writes log_volume_total (sum over all containers) to the
# summary.
#
# Globals read: SINCE, LOG_VOLUME_TOP_N, TMP_DIR
#
# `docker logs` can fail for a single container, for example when it is
# removed between `docker ps` and `docker logs`. Under `set -euo pipefail`
# that failure would abort the whole report, so it is tolerated and the
# lines received up to that point are counted.
collect_log_volume() {
  append "## Log-Volumen ($SINCE)"
  append ""
  local volume_file="$TMP_DIR/log-volume.tsv"
  : > "$volume_file"
  local name count
  while IFS= read -r name; do
    [ -n "$name" ] || continue
    count="$({ docker logs --since "$SINCE" "$name" 2>&1 || true; } | count_lines)"
    printf '%d\t%s\n' "$count" "$name" >> "$volume_file"
  done < <(docker ps --format '{{.Names}}')
  local total
  total="$(awk '{ s += $1 } END { print s + 0 }' "$volume_file")"
  set_summary "log_volume_total" "$total"
  if [ "$total" -eq 0 ]; then
    append "- Keine Logzeilen im Zeitraum (unwahrscheinlich, evtl. Datenquelle pruefen)."
    record_section_error "log-volume" "Log-Volumen ueber alle Container ist 0"
  else
    append "- Zeilen insgesamt im Zeitraum: $total"
    append ""
    append "### Top $LOG_VOLUME_TOP_N lauteste Container"
    append ""
    append "| Container | Zeilen |"
    append "|---|---:|"
    local line_count container
    sort -nr "$volume_file" | head -n "$LOG_VOLUME_TOP_N" | while IFS=$'\t' read -r line_count container; do
      append "| $container | $line_count |"
    done
    append ""
    append "Bewertung: Auffaellig laute Container sind oft ein Frueh-Indikator fuer Endlosschleifen, schlecht konfigurierte Loglevel oder Probe-Spam."
  fi
  append ""
}
# Appends the "Vergleich mit gestern" section to the report body.
#
# Compares the summary values of the current run (sourced from SUMMARY_PATH)
# with the persisted summary of the day before REPORT_DATE
# ($REPORT_DIR/summary-<date>.env) and renders both in a markdown table.
#
# Globals read: REPORT_DATE, REPORT_DIR, SUMMARY_PATH
# Side effects: sources SUMMARY_PATH, so every key in that file becomes a
#               shell variable of the running script.
# Output:       report lines via append.
collect_diff_yesterday() {
  append "## Vergleich mit gestern"
  append ""

  local yesterday yesterday_summary
  # Anchor "yesterday" on REPORT_DATE instead of the wall clock so that a
  # report generated for an overridden REPORT_DATE is compared with the day
  # before *that* date. Relative date parsing requires GNU date; on failure
  # the variable stays empty and the section reports missing data.
  yesterday="$(date -d "$REPORT_DATE -1 day" +%F 2>/dev/null || true)"
  yesterday_summary="$REPORT_DIR/summary-$yesterday.env"
  if [ -z "$yesterday" ] || [ ! -f "$yesterday_summary" ]; then
    append "- Keine Vortagsdaten verfuegbar ($yesterday_summary)."
    append ""
    return
  fi

  local prev_borg="" prev_alerts="" prev_firing="" prev_pending=""
  local prev_unhealthy="" prev_exited="" prev_5xx="" prev_events=""
  local prev_log="" prev_certs="" prev_disk="" prev_img=""
  local prev_drift="" prev_vol=""
  local key value

  # Yesterday's file is parsed as plain text (split at the first '='), not
  # sourced, so it cannot overwrite today's variables.
  # NOTE(review): values are taken verbatim; if the summary writer quotes
  # values, the quotes would show up in the "Gestern" column - confirm format.
  # The "|| [ -n "$key" ]" part keeps a final line without trailing newline.
  while IFS='=' read -r key value || [ -n "$key" ]; do
    case "$key" in
      borg_status) prev_borg="$value" ;;
      prometheus_alerts) prev_alerts="$value" ;;
      prometheus_alerts_firing) prev_firing="$value" ;;
      prometheus_alerts_pending) prev_pending="$value" ;;
      containers_unhealthy) prev_unhealthy="$value" ;;
      containers_exited_nonzero) prev_exited="$value" ;;
      traefik_5xx) prev_5xx="$value" ;;
      docker_events) prev_events="$value" ;;
      log_highlights) prev_log="$value" ;;
      cert_warnings) prev_certs="$value" ;;
      disk_warnings) prev_disk="$value" ;;
      image_warnings) prev_img="$value" ;;
      backup_duration_drift) prev_drift="$value" ;;
      log_volume_total) prev_vol="$value" ;;
    esac
  done < "$yesterday_summary"

  # Today's values come from the summary file of the current run.
  # shellcheck disable=SC1090
  . "$SUMMARY_PATH"

  append "Vergleich des Datums $REPORT_DATE mit $yesterday."
  append ""
  append "| Metrik | Heute | Gestern |"
  append "|---|---:|---:|"
  append "| Borg Status | ${borg_status:-?} | ${prev_borg:-?} |"
  append "| Prometheus Alerts gesamt | ${prometheus_alerts:-?} | ${prev_alerts:-?} |"
  append "| Prometheus firing | ${prometheus_alerts_firing:-?} | ${prev_firing:-?} |"
  append "| Prometheus pending | ${prometheus_alerts_pending:-?} | ${prev_pending:-?} |"
  append "| Container unhealthy | ${containers_unhealthy:-?} | ${prev_unhealthy:-?} |"
  append "| Container exited non-zero | ${containers_exited_nonzero:-?} | ${prev_exited:-?} |"
  append "| Docker Events | ${docker_events:-?} | ${prev_events:-?} |"
  append "| Traefik 5xx | ${traefik_5xx:-?} | ${prev_5xx:-?} |"
  append "| Log-Highlights | ${log_highlights:-?} | ${prev_log:-?} |"
  append "| Log-Volumen | ${log_volume_total:-?} | ${prev_vol:-?} |"
  append "| Zertifikatswarnungen | ${cert_warnings:-?} | ${prev_certs:-?} |"
  append "| Storage-Warnungen | ${disk_warnings:-?} | ${prev_disk:-?} |"
  append "| Image-Warnungen | ${image_warnings:-?} | ${prev_img:-?} |"
  append "| Backup-Dauer-Drift | ${backup_duration_drift:-?} | ${prev_drift:-?} |"
  append ""

  # Only these seven metrics decide whether the day counts as "changed";
  # a missing value on either side is treated as 0 (or "unknown" for Borg).
  local notable=0
  if [ "${containers_exited_nonzero:-0}" != "${prev_exited:-0}" ] ||
    [ "${containers_unhealthy:-0}" != "${prev_unhealthy:-0}" ] ||
    [ "${prometheus_alerts_firing:-0}" != "${prev_firing:-0}" ] ||
    [ "${prometheus_alerts_pending:-0}" != "${prev_pending:-0}" ] ||
    [ "${log_highlights:-0}" != "${prev_log:-0}" ] ||
    [ "${borg_status:-unknown}" != "${prev_borg:-unknown}" ] ||
    [ "${backup_duration_drift:-0}" != "${prev_drift:-0}" ]; then
    notable=1
  fi

  if [ "$notable" -eq 0 ]; then
    append "Bewertung: Keine relevanten Aenderungen gegenueber gestern."
  else
    append "Bewertung: Relevante Aenderungen gegenueber gestern. Details bitte in den einzelnen Abschnitten pruefen."
  fi
  append ""
}
# Appends the "Self-Health" section: how long the script has been running,
# how many sections recorded an error, whether the noise pattern file exists
# and which lock file is in use. Runtime and failure count are also stored in
# the summary via set_summary.
#
# Globals read: SCRIPT_START, SECTION_ERRORS_FILE, NOISE_PATTERNS_FILE, LOCK_FILE
collect_self_health() {
  local section_failures script_duration noise_file_state error_entry

  append "## Self-Health"
  append ""

  # Elapsed wall-clock seconds since SCRIPT_START.
  script_duration=$(( $(date +%s) - SCRIPT_START ))
  # One line in the errors file corresponds to one recorded section error.
  section_failures="$(count_lines < "$SECTION_ERRORS_FILE")"

  set_summary "script_duration_seconds" "$script_duration"
  set_summary "section_failures" "$section_failures"

  if [ -f "$NOISE_PATTERNS_FILE" ]; then
    noise_file_state="ja"
  else
    noise_file_state="nein"
  fi

  append "- Skript-Laufzeit: $(format_duration "$script_duration") (${script_duration}s)"
  append "- Sektionen mit Fehlern: $section_failures"
  append "- Noise-Pattern-Datei vorhanden: $noise_file_state"
  append "- Lock-Datei: \`$LOCK_FILE\`"

  # List the individual error entries only when there are any.
  if [ "$section_failures" -gt 0 ]; then
    append ""
    append "### Fehlerhafte Sektionen"
    append ""
    while IFS= read -r error_entry; do
      append "- $error_entry"
    done < "$SECTION_ERRORS_FILE"
  fi

  append ""
}
# Assembles the final markdown report and persists the run summary.
#
# The report consists of a generated header (metadata, executive summary,
# management bullet list, classification note), the collected section body
# from BODY_PATH and a closing assessment. Both output files are written to
# a ".tmp" sibling first and then renamed, so a partially written file is
# never visible under the final name.
#
# Globals read: REPORT_DIR, REPORT_PATH, PERSISTENT_SUMMARY_PATH,
#               SUMMARY_PATH, BODY_PATH, REPORT_DATE, REPORT_STATUS, SINCE
# Side effects: sources SUMMARY_PATH (its keys become shell variables).
write_report() {
  # REPORT_PATH and PERSISTENT_SUMMARY_PATH can be overridden independently
  # of REPORT_DIR, so make sure each target directory exists.
  mkdir -p -- "$REPORT_DIR" \
    "$(dirname -- "$REPORT_PATH")" \
    "$(dirname -- "$PERSISTENT_SUMMARY_PATH")"

  # shellcheck disable=SC1090
  . "$SUMMARY_PATH"

  {
    printf '# Homelab Operations Report - %s\n\n' "$REPORT_DATE"
    printf '%s\n' "- Erstellt: \`$(date -Iseconds)\`"
    printf '%s\n' "- Zeitraum: letzte \`$SINCE\`"
    printf '%s\n' "- Host: \`$(hostname)\`"
    printf '%s\n\n' "- Gesamtbewertung: \`$REPORT_STATUS\`"

    printf '## Executive Summary\n\n'
    # Any status other than OK / WARNUNG gets the critical wording.
    case "$REPORT_STATUS" in
      OK)
        printf 'Im betrachteten Zeitraum zeigt das Homelab eine stabile Betriebslage. Das letzte Borg-Backup ist erfolgreich abgeschlossen, Prometheus meldet keine firing Alerts, keine unhealthy Container, Zertifikate und Storage im erwarteten Bereich.\n\n'
        ;;
      WARNUNG)
        printf 'Im betrachteten Zeitraum gibt es Punkte, die Aufmerksamkeit verdienen. Der Betrieb ist nicht automatisch als kompromittiert zu bewerten, aber mindestens ein Signal (Backup, Pending Alert, Zertifikat, Storage, Image-Alter, Drift oder Reboot) weicht vom Normalzustand ab.\n\n'
        ;;
      *)
        printf 'Im betrachteten Zeitraum liegt ein kritisches Betriebssignal vor. Der Bericht sollte zeitnah gelesen und die betroffenen Komponenten priorisiert geprueft werden.\n\n'
        ;;
    esac

    printf '### Management-Bewertung\n\n'
    printf '%s\n' "- Status: \`$REPORT_STATUS\`"
    printf '%s\n' "- Borg Backup: \`${borg_status:-unknown}\`"
    printf '%s\n' "- Backup-Dauer-Drift: \`${backup_duration_drift:-unknown}\`"
    printf '%s\n' "- Prometheus Alerts (gesamt/firing/pending): \`${prometheus_alerts:-unknown}\` / \`${prometheus_alerts_firing:-unknown}\` / \`${prometheus_alerts_pending:-unknown}\`"
    printf '%s\n' "- Container unhealthy: \`${containers_unhealthy:-unknown}\`"
    printf '%s\n' "- Container exited non-zero: \`${containers_exited_nonzero:-unknown}\`"
    printf '%s\n' "- Docker Critical Events: \`${docker_events:-unknown}\`"
    printf '%s\n' "- Traefik 5xx: \`${traefik_5xx:-unknown}\`"
    printf '%s\n' "- Zertifikatswarnungen: \`${cert_warnings:-unknown}\`"
    printf '%s\n' "- Storage-Warnungen: \`${disk_warnings:-unknown}\`"
    printf '%s\n' "- Image-Warnungen: \`${image_warnings:-unknown}\`"
    printf '%s\n' "- Log-Highlights: \`${log_highlights:-unknown}\`"
    printf '%s\n' "- Noise-Pattern ueber Schwelle: \`${noise_threshold_exceeded:-0}\`"
    printf '%s\n' "- Log-Volumen gesamt: \`${log_volume_total:-unknown}\`"
    printf '%s\n' "- Reboot in letzten 24h: \`${host_recent_boot:-unknown}\`"
    printf '%s\n\n' "- Sektionsfehler im Skript: \`${section_failures:-unknown}\`"

    printf '### Einordnung\n\n'
    printf 'Dieser Report ist ein Management-Lagebericht: Er verdichtet Backup-Status, Container-Zustand, Monitoring-Alerts, Traefik-Fehler, Zertifikate, Storage, Image-Aktualitaet, Log-Volumen und Drift-Indikatoren. Rohlogs werden nur ausschnittsweise gezeigt, damit der Bericht lesbar bleibt und trotzdem nachvollziehbar ist.\n\n'

    # Detailed sections collected earlier in the run.
    cat "$BODY_PATH"

    printf '## Schlussbewertung\n\n'
    case "$REPORT_STATUS" in
      OK)
        printf 'Das Homelab war im betrachteten Zeitraum betriebsfaehig und ohne akute Warnsignale. Es besteht aus diesem Report heraus kein unmittelbarer Handlungsdruck.\n'
        ;;
      WARNUNG)
        printf 'Das Homelab war grundsaetzlich betriebsfaehig, zeigt aber mindestens eine Auffaelligkeit. Die im Bericht genannten Punkte sollten geprueft und bei Wiederholung nachverfolgt werden.\n'
        ;;
      *)
        printf 'Das Homelab zeigt ein kritisches Signal. Die betroffenen Dienste, Backup-Lage und firing Alerts sollten sofort geprueft werden.\n'
        ;;
    esac
  } > "$REPORT_PATH.tmp"
  mv "$REPORT_PATH.tmp" "$REPORT_PATH"

  # Keep a dated copy of the summary next to the reports.
  cp "$SUMMARY_PATH" "$PERSISTENT_SUMMARY_PATH.tmp"
  mv "$PERSISTENT_SUMMARY_PATH.tmp" "$PERSISTENT_SUMMARY_PATH"
}
# Hands the finished report to the external mail script when mailing is
# enabled (SEND_MAIL=1) and MAIL_MODE allows it for the current status:
#   always   - every status
#   warning  - WARNUNG and KRITISCH
#   critical - KRITISCH only
#
# Returns 0 when nothing had to be sent, the mail script's status when it was
# invoked, and 1 when the mail script is not executable or MAIL_MODE is
# unknown (both cases are also recorded as section errors).
send_report_mail() {
  if [ "$SEND_MAIL" != "1" ]; then
    return 0
  fi

  if [ ! -x "$MAIL_SCRIPT" ]; then
    echo "Mail script missing or not executable: $MAIL_SCRIPT" >&2
    record_section_error "mail" "Mail-Skript $MAIL_SCRIPT fehlt oder nicht ausfuehrbar"
    return 1
  fi

  local selector="$MAIL_MODE:$REPORT_STATUS"
  case "$selector" in
    always:* | warning:WARNUNG | warning:KRITISCH | critical:KRITISCH)
      "$MAIL_SCRIPT" "$REPORT_PATH" "$REPORT_STATUS"
      return
      ;;
    warning:* | critical:*)
      # Known mode, but the current status does not trigger a mail.
      return 0
      ;;
  esac

  # Anything that reaches this point used a mode outside always|warning|critical.
  echo "Unknown MAIL_MODE '$MAIL_MODE' - mail not sent. Use always|warning|critical." >&2
  record_section_error "mail" "Unbekanntes MAIL_MODE '$MAIL_MODE'"
  return 1
}
# Pushes a compact plain-text summary of the run through the ntfy helper.
#
# Does nothing (status 0) unless SEND_NTFY=1 and NTFY_SCRIPT is executable.
# The helper is called as: NTFY_SCRIPT <topic> <title> <body> <priority>;
# its failure is deliberately ignored so a notification problem never fails
# the report run.
#
# Side effects: sources SUMMARY_PATH (its keys become shell variables).
send_summary_ntfy() {
  if [ "$SEND_NTFY" != "1" ]; then
    return 0
  fi
  if [ ! -x "$NTFY_SCRIPT" ]; then
    return 0
  fi

  # shellcheck disable=SC1090
  . "$SUMMARY_PATH"

  # Notification priority follows the overall report status.
  local priority
  case "$REPORT_STATUS" in
    KRITISCH) priority="urgent" ;;
    WARNUNG) priority="high" ;;
    *) priority="default" ;;
  esac

  local title="Homelab Tagesprotokoll: ${REPORT_STATUS:-unknown} / borg=${borg_status:-unknown}"

  # One array element per line of the message body.
  local -a body_lines=(
    "Report: $REPORT_PATH"
    "Status: $REPORT_STATUS"
    "Container: ${containers_running:-?}/${containers_total:-?} running, unhealthy=${containers_unhealthy:-?}, exited_nonzero=${containers_exited_nonzero:-?}"
    "Borg: ${borg_status:-unknown} (drift=${backup_duration_drift:-unknown})"
    "Prometheus alerts (total/firing/pending): ${prometheus_alerts:-unknown}/${prometheus_alerts_firing:-unknown}/${prometheus_alerts_pending:-unknown}"
    "Docker events: ${docker_events:-unknown}"
    "Traefik 5xx: ${traefik_5xx:-unknown}"
    "Certs warn: ${cert_warnings:-unknown}"
    "Disk warn: ${disk_warnings:-unknown}"
    "Image warn: ${image_warnings:-unknown}"
    "Log highlights: ${log_highlights:-unknown}"
    "Log volume: ${log_volume_total:-unknown}"
    "Recent boot: ${host_recent_boot:-unknown}"
    "Section errors: ${section_failures:-unknown}"
  )
  local body
  printf -v body '%s\n' "${body_lines[@]}"
  # Drop the newline printf added after the last line.
  body="${body%$'\n'}"

  "$NTFY_SCRIPT" "$NTFY_TOPIC" "$title" "$body" "$priority" || true
}
# Runs the complete report pipeline: collect every section, derive the
# overall status, write the report, then deliver it (mail, ntfy) and print
# the report path on stdout.
#
# Returns: 0, or the non-zero status of send_report_mail when mail delivery
#          failed. In that case the ntfy push and the path output still
#          happen first, so one broken delivery channel does not suppress
#          the other.
main() {
  collect_overview
  collect_host_health
  collect_borg
  collect_prometheus
  collect_certificate_health
  collect_disk_health
  collect_image_freshness
  collect_container_events
  collect_container_state
  collect_traefik_5xx
  collect_log_highlights
  collect_log_volume
  collect_diff_yesterday
  derive_report_status
  collect_self_health
  write_report

  # Under "set -e" a bare failing call would abort the run right here.
  # Capture the status instead and hand it back once everything else is done.
  local mail_rc=0
  send_report_mail || mail_rc=$?

  send_summary_ntfy
  printf '%s\n' "$REPORT_PATH"
  return "$mail_rc"
}
main "$@"

# Translate the overall assessment into the process exit status:
# KRITISCH -> 2, WARNUNG -> 1, anything else -> 0.
if [ "$REPORT_STATUS" = "KRITISCH" ]; then
  exit 2
elif [ "$REPORT_STATUS" = "WARNUNG" ]; then
  exit 1
fi
exit 0