1202 lines
44 KiB
Bash
Executable File
1202 lines
44 KiB
Bash
Executable File
#!/usr/bin/env bash
set -euo pipefail

# Epoch seconds at which this run started.
SCRIPT_START="$(date +%s)"

# ---------------------------------------------------------------------------
# Configuration. A value already present in the environment wins; the default
# below is applied only when the variable is unset or empty.
# ---------------------------------------------------------------------------

# Report output locations.
: "${REPORT_DIR:=/mnt/user/services/posture-check/daily-reports}"
# Kept as a plain assignment so a failing `date` still aborts under `set -e`.
REPORT_DATE="${REPORT_DATE:-$(date +%F)}"
: "${REPORT_PATH:=$REPORT_DIR/homelab-day-$REPORT_DATE.md}"
: "${PERSISTENT_SUMMARY_PATH:=$REPORT_DIR/summary-$REPORT_DATE.env}"

# Look-back window handed to `docker logs --since` / `docker events --since`.
: "${SINCE:=24h}"

# Display limits and warning thresholds used by the collectors.
: "${MAX_LOG_LINES:=80}"
: "${CERT_MAX_ROWS:=12}"
: "${IMAGE_AGE_WARN_DAYS:=180}"
: "${LOG_VOLUME_TOP_N:=10}"
: "${DISK_USAGE_WARN_PCT:=85}"
: "${CERT_WARN_DAYS:=21}"
: "${BACKUP_DRIFT_FACTOR:=2.0}"
: "${SHOW_KNOWN_NOISE:=0}"

# Notification settings.
: "${SEND_MAIL:=0}"
: "${MAIL_MODE:=always}"
: "${MAIL_SCRIPT:=/mnt/user/services/homelab-infra/services/posture-check/send-operations-report-mail.sh}"
: "${SEND_NTFY:=0}"
: "${NTFY_TOPIC:=homelab-info}"
: "${NTFY_SCRIPT:=/mnt/user/services/homelab-infra/ops/restore-tests/send-ntfy.sh}"

# Containers and files inspected by the collectors.
: "${BORG_CONTAINER:=borg-ui}"
: "${PROMETHEUS_CONTAINER:=monitoring-prometheus}"
: "${TRAEFIK_ACME_PATH:=/mnt/user/appdata/traefik/letsencrypt/acme.json}"

# Known-noise filtering for the log evaluation.
: "${NOISE_PATTERNS_FILE:=/mnt/user/services/homelab-infra/services/posture-check/log-noise.patterns}"
: "${NORMALIZE_NOISE_SCRIPT:=/mnt/user/services/homelab-infra/services/posture-check/lib/normalize-noise-patterns.sh}"
: "${NOISE_ESCALATION_THRESHOLD:=500}"
: "${NOISE_BREAKDOWN_TOP_N:=10}"

: "${POSTURE_CHECK_FILE:=/mnt/user/services/posture-check/last.json}"
: "${LOCK_FILE:=/tmp/homelab-daily-report.lock}"

# Overall verdict; overwritten by derive_report_status.
REPORT_STATUS="UNKNOWN"
|
|
|
|
# Single-instance guard: open the lock file on fd 9 and try to take an
# exclusive lock without blocking. If another run holds it, report that on
# stderr and exit with status 3.
exec 9>"$LOCK_FILE"
flock -n 9 || {
  echo "Another daily-status-report run is already in progress (lock: $LOCK_FILE)" >&2
  exit 3
}
|
|
|
|
# Per-run scratch directory holding the report body, the key=value summary and
# the section error log. Each file is created empty up front.
TMP_DIR="$(mktemp -d /tmp/homelab-daily-report.XXXXXX)"
BODY_PATH="$TMP_DIR/body.md"
SUMMARY_PATH="$TMP_DIR/summary.env"
SECTION_ERRORS_FILE="$TMP_DIR/section-errors.log"
for scratch_file in "$BODY_PATH" "$SUMMARY_PATH" "$SECTION_ERRORS_FILE"; do
  : > "$scratch_file"
done
unset scratch_file
|
|
|
|
# Remove the per-run scratch directory and everything in it.
cleanup() {
  rm -r -f "$TMP_DIR"
}

# Run cleanup on every exit path of the script.
trap 'cleanup' EXIT
|
|
|
|
# Append one line to the report body ($BODY_PATH).
# Arguments: any number of words; they are joined via "$*" (first character of
#            IFS as separator) and written followed by a newline.
append() {
  local text="$*"
  printf '%s\n' "$text" >> "$BODY_PATH"
}
|
|
|
|
# Copy everything arriving on stdin to the end of the report body ($BODY_PATH).
append_block() {
  cat - >> "$BODY_PATH"
}
|
|
|
|
# Record one key=value pair in the run summary ($SUMMARY_PATH).
# Arguments: $1 - key, $2 - value. The pair is appended as "key=value".
set_summary() {
  local key="$1"
  local value="$2"
  printf '%s=%s\n' "$key" "$value" >> "$SUMMARY_PATH"
}
|
|
|
|
# Note a problem encountered while building a report section.
# Arguments: $1 - section name, $2 - message. Appended to
#            $SECTION_ERRORS_FILE as "section: message".
record_section_error() {
  local section="$1"
  local message="$2"
  printf '%s: %s\n' "$section" "$message" >> "$SECTION_ERRORS_FILE"
}
|
|
|
|
# Check whether a Docker container with the given name or ID exists.
# Arguments: $1 - container name or ID
# Returns:   0 if the container exists, non-zero otherwise. All output of the
#            docker call is discarded.
# Uses `docker container inspect` rather than plain `docker inspect`, because
# the latter also succeeds for images, networks or volumes of the same name.
have_container() {
  docker container inspect "$1" >/dev/null 2>&1
}
|
|
|
|
# Count the newline-terminated lines on stdin and print the number as a plain
# integer (no padding), followed by a newline.
count_lines() {
  local raw
  raw="$(wc -l)"
  printf '%s\n' "$raw" | awk '{ print $1 + 0 }'
}
|
|
|
|
# Filter: collapse every run of whitespace on each stdin line into a single
# space and keep only the first 260 characters of the line.
shorten() {
  local max_chars=260
  sed -E -e 's/[[:space:]]+/ /g' | cut -c "1-${max_chars}"
}
|
|
|
|
# Render a duration given in seconds as a short German phrase using the two
# most significant units.
# Arguments: $1 - non-negative integer seconds (defaults to 0 when missing or
#                 empty)
# Outputs:   one line on stdout, e.g. "1 Tage 1 Stunden"; "?" when the
#            argument is not a plain digit string.
format_duration() {
  local s="${1:-0}"

  # Validate the whole string; anything but digits (including embedded
  # newlines) is reported as unknown.
  if [[ ! "$s" =~ ^[0-9]+$ ]]; then
    printf '?\n'
    return
  fi

  # Force base 10 so values with leading zeros ("08") are not read as octal.
  s=$(( 10#$s ))

  local d=$(( s / 86400 ))
  local h=$(( (s % 86400) / 3600 ))
  local m=$(( (s % 3600) / 60 ))
  local sec=$(( s % 60 ))

  if [ "$d" -gt 0 ]; then
    printf '%d Tage %d Stunden\n' "$d" "$h"
  elif [ "$h" -gt 0 ]; then
    printf '%d Stunden %d Minuten\n' "$h" "$m"
  elif [ "$m" -gt 0 ]; then
    printf '%d Minuten %d Sekunden\n' "$m" "$sec"
  else
    printf '%d Sekunden\n' "$sec"
  fi
}
|
|
|
|
# Write the "Betriebslage" section: container counts from `docker ps` plus the
# status and file age of the last posture check. The same figures are stored
# in the summary; a missing posture file is recorded as a section error.
collect_overview() {
  local total running unhealthy exited_nonzero

  total="$(docker ps -a --format '{{.Names}}' | count_lines)"
  running="$(docker ps --format '{{.Names}}' | count_lines)"
  unhealthy="$(docker ps --filter health=unhealthy --format '{{.Names}}' | count_lines)"
  # Exited containers whose status text does not contain "Exited (0)".
  exited_nonzero="$(docker ps -a --filter status=exited --format '{{.Names}} {{.Status}}' | awk '!/Exited \(0\)/ { count++ } END { print count + 0 }')"

  set_summary "containers_total" "$total"
  set_summary "containers_running" "$running"
  set_summary "containers_unhealthy" "$unhealthy"
  set_summary "containers_exited_nonzero" "$exited_nonzero"

  append "## Betriebslage"
  append ""
  append "- Container: $running/$total laufen"
  append "- Unhealthy Container: $unhealthy"
  append "- Exited non-zero Container: $exited_nonzero"

  # No posture file: note it and finish the section early.
  if [ ! -f "$POSTURE_CHECK_FILE" ]; then
    append "- Letzter Posture-Check: keine Datei gefunden"
    set_summary "posture_status" "missing"
    record_section_error "overview" "Posture-Check-Datei $POSTURE_CHECK_FILE fehlt"
    append ""
    return
  fi

  local posture_status posture_mtime posture_age now_epoch
  # First "status": "..." value found in the file.
  posture_status="$(sed -n 's/.*"status": *"\([^"]*\)".*/\1/p' "$POSTURE_CHECK_FILE" | head -n 1)"
  now_epoch="$(date +%s)"
  # If the mtime cannot be read, fall back to "now" so the age becomes 0.
  posture_mtime="$(stat -c %Y "$POSTURE_CHECK_FILE" 2>/dev/null || echo "$now_epoch")"
  posture_age=$(( now_epoch - posture_mtime ))

  append "- Letzter Posture-Check: ${posture_status:-unbekannt} (Datei ist $(format_duration "$posture_age") alt)"
  set_summary "posture_status" "${posture_status:-unknown}"
  set_summary "posture_age_seconds" "$posture_age"
  append ""
}
|
|
|
|
# Write the "Host" section: hostname, boot time and uptime (from the btime
# line of /proc/stat), load averages (from /proc/loadavg) and a warning when
# the host booted within the last 24 hours.
collect_host_health() {
  append "## Host"
  append ""

  local boot_epoch boot_iso uptime_seconds load_1 load_5 load_15 now_epoch

  now_epoch="$(date +%s)"
  boot_epoch="$(awk '/^btime/ { print $2 }' /proc/stat 2>/dev/null || echo 0)"

  # Defaults apply when btime is unavailable.
  boot_iso="unknown"
  uptime_seconds=0
  if [ "${boot_epoch:-0}" -gt 0 ]; then
    boot_iso="$(date -u -d "@$boot_epoch" -Iseconds 2>/dev/null || echo unknown)"
    uptime_seconds=$(( now_epoch - boot_epoch ))
  else
    record_section_error "host" "/proc/stat btime nicht lesbar"
  fi

  # Never report a negative uptime.
  [ "$uptime_seconds" -ge 0 ] || uptime_seconds=0

  load_1="?"
  load_5="?"
  load_15="?"
  if [ -r /proc/loadavg ]; then
    read -r load_1 load_5 load_15 _ < /proc/loadavg
  fi

  append "- Hostname: \`$(hostname)\`"
  append "- Boot-Zeit: \`$boot_iso\`"
  append "- Uptime: $(format_duration "$uptime_seconds")"
  append "- Load average (1/5/15): $load_1 / $load_5 / $load_15"

  if [ "$uptime_seconds" -ge 86400 ]; then
    append "- Reboot in den letzten 24h: nein"
    set_summary "host_recent_boot" "0"
  else
    append "- WARNUNG: Boot innerhalb der letzten 24 Stunden erkannt."
    set_summary "host_recent_boot" "1"
  fi

  set_summary "host_uptime_seconds" "$uptime_seconds"
  set_summary "host_load_1" "$load_1"
  append ""
}
|
|
|
|
# Derive the overall report verdict from the collected summary values.
# Globals:  SUMMARY_PATH (read), REPORT_STATUS (written). Every key found in
#           the summary file is also set as a shell variable of the same name.
# Result:   REPORT_STATUS becomes "KRITISCH" when a critical condition holds,
#           otherwise "WARNUNG" when a warning condition holds, otherwise "OK".
#           The verdict is appended to the summary as report_status.
#
# The summary file is parsed as literal key=value lines instead of being
# sourced: several values come from external data (for example a status
# string scraped out of a JSON file), and sourcing would run such text as
# shell code. When a key occurs more than once, the last value wins.
derive_report_status() {
  local _summary_line _summary_key
  while IFS= read -r _summary_line || [ -n "$_summary_line" ]; do
    # Lines without "=" carry no assignment.
    [[ "$_summary_line" == *=* ]] || continue
    _summary_key="${_summary_line%%=*}"
    # Only accept valid shell identifiers as keys.
    [[ "$_summary_key" =~ ^[A-Za-z_][A-Za-z0-9_]*$ ]] || continue
    printf -v "$_summary_key" '%s' "${_summary_line#*=}"
  done < "$SUMMARY_PATH"

  REPORT_STATUS="OK"
  local has_warn=0 has_crit=0

  # Warning conditions.
  if [ "${borg_status:-unknown}" != "completed" ]; then has_warn=1; fi
  if [ "${prometheus_alerts:-0}" = "unknown" ]; then has_warn=1; fi
  if [ "${cert_warnings:-0}" != "0" ]; then has_warn=1; fi
  if [ "${disk_warnings:-0}" != "0" ]; then has_warn=1; fi
  if [ "${image_warnings:-0}" != "0" ]; then has_warn=1; fi
  if [ "${containers_exited_nonzero:-0}" != "0" ]; then has_warn=1; fi
  if [ "${host_recent_boot:-0}" = "1" ]; then has_warn=1; fi
  if [ "${backup_duration_drift:-0}" = "1" ]; then has_warn=1; fi
  if [ "${noise_threshold_exceeded:-0}" != "0" ]; then has_warn=1; fi
  if [ "${prometheus_alerts_pending:-0}" != "0" ] && [ "${prometheus_alerts_pending:-0}" != "unknown" ]; then
    has_warn=1
  fi

  # Critical conditions.
  if [ "${borg_status:-unknown}" = "failed" ]; then has_crit=1; fi
  if [ "${borg_status:-unknown}" = "error" ]; then has_crit=1; fi
  if [ "${containers_unhealthy:-0}" != "0" ]; then has_crit=1; fi
  if [ "${prometheus_alerts_firing:-0}" != "0" ] && [ "${prometheus_alerts_firing:-0}" != "unknown" ]; then
    has_crit=1
  fi

  if [ "$has_crit" -eq 1 ]; then
    REPORT_STATUS="KRITISCH"
  elif [ "$has_warn" -eq 1 ]; then
    REPORT_STATUS="WARNUNG"
  fi

  set_summary "report_status" "$REPORT_STATUS"
}
|
|
|
|
# Write the "Borg Backup" section by querying /data/borg.db inside the
# $BORG_CONTAINER container with two embedded Python scripts:
#   1. a Markdown report (recent jobs, schedule, duration drift) that is
#      appended directly to the report body;
#   2. a compact "status=... / drift=..." probe used for the summary keys
#      borg_status and backup_duration_drift.
# Globals: BORG_CONTAINER, BODY_PATH, BACKUP_DRIFT_FACTOR (read).
#
# The drift threshold is handed into the container via the environment
# variable BACKUP_DRIFT_FACTOR instead of being hard-coded in the Python code.
# Missing, unparsable or non-positive values fall back to 2.0.
collect_borg() {
  append "## Borg Backup"
  append ""

  if ! have_container "$BORG_CONTAINER"; then
    append "- WARNUNG: Container \`$BORG_CONTAINER\` nicht gefunden."
    append ""
    set_summary "borg_status" "unknown"
    set_summary "backup_duration_drift" "unknown"
    record_section_error "borg" "Container $BORG_CONTAINER nicht gefunden"
    return
  fi

  local drift_factor="${BACKUP_DRIFT_FACTOR:-2.0}"

  if ! docker exec -i -e "BACKUP_DRIFT_FACTOR=$drift_factor" "$BORG_CONTAINER" python3 - <<'PY' >> "$BODY_PATH"
import os
import sqlite3


def drift_factor():
    # Threshold ratio (latest run / median) above which drift is reported.
    try:
        factor = float(os.environ.get("BACKUP_DRIFT_FACTOR", "2.0"))
    except ValueError:
        return 2.0
    return factor if factor > 0 else 2.0


def fmt_bytes(value):
    if value is None:
        return "-"
    value = float(value)
    units = ["B", "KB", "MB", "GB", "TB"]
    for unit in units:
        if value < 1024 or unit == units[-1]:
            return f"{value:.1f} {unit}" if unit != "B" else f"{int(value)} B"
        value /= 1024


def fmt_sec(s):
    s = int(s)
    h, rem = divmod(s, 3600)
    m, sec = divmod(rem, 60)
    if h > 0:
        return f"{h}h {m}m"
    return f"{m}m {sec}s"


conn = sqlite3.connect("/data/borg.db")
conn.row_factory = sqlite3.Row
cur = conn.cursor()

print("### Letzte Backup-Jobs")
rows = cur.execute("""
    select id, status, started_at, completed_at, archive_name, nfiles,
           original_size, compressed_size, deduplicated_size, error_message
    from backup_jobs
    where started_at >= datetime('now', '-30 hours')
       or created_at >= datetime('now', '-30 hours')
    order by coalesce(started_at, created_at) desc
    limit 8
""").fetchall()

if not rows:
    print("- WARNUNG: Kein Backup-Job in den letzten 30 Stunden gefunden.")
else:
    print("| Zeit UTC | Status | Archiv | Dateien | Original | Dedupliziert |")
    print("|---|---:|---|---:|---:|---:|")
    for row in rows:
        archive = row["archive_name"] or "-"
        if len(archive) > 54:
            archive = archive[:51] + "..."
        print(
            f"| {row['started_at'] or row['completed_at'] or '-'} "
            f"| {row['status']} "
            f"| {archive} "
            f"| {row['nfiles'] if row['nfiles'] is not None else '-'} "
            f"| {fmt_bytes(row['original_size'])} "
            f"| {fmt_bytes(row['deduplicated_size'])} |"
        )
        if row["error_message"]:
            print(f" - Fehler: {row['error_message'][:240]}")

print("")
print("### Zeitplan")
for row in cur.execute("""
    select name, enabled, last_run, next_run, cron_expression
    from scheduled_jobs
    order by id
"""):
    enabled = "aktiv" if row["enabled"] else "pausiert"
    print(f"- {row['name']}: {enabled}, last={row['last_run'] or '-'}, next={row['next_run'] or '-'}, cron=`{row['cron_expression']}`")

print("")
print("### Dauer-Drift (Median 14 Tage)")
duration_rows = cur.execute("""
    select started_at, completed_at,
           (julianday(completed_at) - julianday(started_at)) * 86400 as duration_seconds
    from backup_jobs
    where status = 'completed'
      and started_at is not null
      and completed_at is not null
      and completed_at >= datetime('now', '-14 days')
    order by completed_at desc
""").fetchall()

durations = [r["duration_seconds"] for r in duration_rows if r["duration_seconds"] and r["duration_seconds"] > 0]

if len(durations) < 3:
    print(f"- Zu wenig Datenpunkte fuer eine Drift-Bewertung (n={len(durations)}).")
else:
    durations_sorted = sorted(durations)
    median = durations_sorted[len(durations_sorted) // 2]
    latest = durations[0]
    ratio = latest / median if median > 0 else 0
    print(f"- Letzter Lauf: {fmt_sec(latest)}")
    print(f"- Median 14 Tage: {fmt_sec(median)} (n={len(durations)})")
    print(f"- Verhaeltnis: {ratio:.2f}x")
    if ratio > drift_factor():
        print(f"- Bewertung: Drift erkannt - letzter Lauf {ratio:.1f}x langsamer als der Median. Quellgroesse, IO und Repo-Zustand pruefen.")
    else:
        print("- Bewertung: Backup-Dauer im erwarteten Bereich.")
PY
  then
    append "- WARNUNG: Borg-Auswertung fehlgeschlagen."
    set_summary "borg_status" "unknown"
    set_summary "backup_duration_drift" "unknown"
    record_section_error "borg" "Python-Auswertung in $BORG_CONTAINER fehlgeschlagen"
  else
    local borg_out borg_status borg_drift
    # Best effort: a failing probe leaves borg_out empty and both summary
    # values end up as "unknown".
    borg_out="$(docker exec -i -e "BACKUP_DRIFT_FACTOR=$drift_factor" "$BORG_CONTAINER" python3 - <<'PY' 2>/dev/null || true
import os
import sqlite3


def drift_factor():
    try:
        factor = float(os.environ.get("BACKUP_DRIFT_FACTOR", "2.0"))
    except ValueError:
        return 2.0
    return factor if factor > 0 else 2.0


conn = sqlite3.connect("/data/borg.db")
conn.row_factory = sqlite3.Row
cur = conn.cursor()
status_row = cur.execute("""
    select status
    from backup_jobs
    order by coalesce(started_at, created_at) desc
    limit 1
""").fetchone()
status = status_row[0] if status_row else "missing"

duration_rows = cur.execute("""
    select (julianday(completed_at) - julianday(started_at)) * 86400 as ds
    from backup_jobs
    where status = 'completed'
      and started_at is not null
      and completed_at is not null
      and completed_at >= datetime('now', '-14 days')
    order by completed_at desc
""").fetchall()
durations = [r[0] for r in duration_rows if r[0] and r[0] > 0]
if len(durations) < 3:
    drift = "insufficient"
else:
    median = sorted(durations)[len(durations)//2]
    latest = durations[0]
    ratio = latest / median if median > 0 else 0
    drift = "1" if ratio > drift_factor() else "0"

print(f"status={status}")
print(f"drift={drift}")
PY
)"
    borg_status="$(printf '%s' "$borg_out" | sed -n 's/^status=//p' | head -n 1)"
    borg_drift="$(printf '%s' "$borg_out" | sed -n 's/^drift=//p' | head -n 1)"
    if [ "${borg_drift:-}" = "1" ]; then
      set_summary "backup_duration_drift" "1"
    elif [ "${borg_drift:-}" = "0" ]; then
      set_summary "backup_duration_drift" "0"
    else
      # Covers "insufficient" as well as a failed probe.
      set_summary "backup_duration_drift" "unknown"
    fi
    set_summary "borg_status" "${borg_status:-unknown}"
  fi

  append ""
}
|
|
|
|
# Write the "Prometheus Alerts" section from the alerts API queried inside the
# $PROMETHEUS_CONTAINER container, and store the total / firing / pending
# counts in the summary ("unknown" when the container or the API is not
# available).
# Globals: PROMETHEUS_CONTAINER, BODY_PATH (read).
#
# Every grep used for counting or listing is allowed to find nothing: with
# `set -o pipefail` a non-matching grep would otherwise fail the surrounding
# command substitution and, under `set -e`, abort the whole report - for
# example when alerts exist but none of them is firing.
collect_prometheus() {
  append "## Prometheus Alerts"
  append ""

  if ! have_container "$PROMETHEUS_CONTAINER"; then
    append "- WARNUNG: Container \`$PROMETHEUS_CONTAINER\` nicht gefunden."
    append ""
    set_summary "prometheus_alerts" "unknown"
    set_summary "prometheus_alerts_firing" "unknown"
    set_summary "prometheus_alerts_pending" "unknown"
    record_section_error "prometheus" "Container $PROMETHEUS_CONTAINER nicht gefunden"
    return
  fi

  local alerts
  alerts="$(docker exec "$PROMETHEUS_CONTAINER" wget -qO- http://localhost:9090/api/v1/alerts 2>/dev/null || true)"

  if [ -z "$alerts" ]; then
    append "- WARNUNG: Prometheus Alerts API nicht erreichbar."
    set_summary "prometheus_alerts" "unknown"
    set_summary "prometheus_alerts_firing" "unknown"
    set_summary "prometheus_alerts_pending" "unknown"
    record_section_error "prometheus" "Alerts-API leer oder nicht erreichbar"
  elif grep -q '"alerts":\[\]' <<< "$alerts"; then
    append "- Keine aktiven Alerts."
    set_summary "prometheus_alerts" "0"
    set_summary "prometheus_alerts_firing" "0"
    set_summary "prometheus_alerts_pending" "0"
  else
    local total firing pending
    total="$({ grep -o '"alertname":"[^"]*"' <<< "$alerts" || true; } | count_lines)"
    firing="$({ grep -o '"state":"firing"' <<< "$alerts" || true; } | count_lines)"
    pending="$({ grep -o '"state":"pending"' <<< "$alerts" || true; } | count_lines)"

    append "- Aktive Alerts insgesamt: $total"
    append "- Davon firing: $firing"
    append "- Davon pending: $pending"
    append ""
    append "### Details"
    # One list item per extracted label/state fragment.
    { grep -o '"alertname":"[^"]*"\|"severity":"[^"]*"\|"instance":"[^"]*"\|"service":"[^"]*"\|"state":"[^"]*"' <<< "$alerts" || true; } \
      | sed 's/^/ - /' >> "$BODY_PATH"

    set_summary "prometheus_alerts" "$total"
    set_summary "prometheus_alerts_firing" "$firing"
    set_summary "prometheus_alerts_pending" "$pending"
  fi

  append ""
}
|
|
|
|
# Write the "Zertifikate" section: decode every certificate stored in the
# Traefik ACME file (inside a throwaway python:3.13-alpine container), list
# the $CERT_MAX_ROWS certificates closest to expiry and count all certificates
# with fewer than $CERT_WARN_DAYS days left. The count is stored in the
# summary as cert_warnings; a missing, unreadable or empty ACME file counts
# as one warning.
# Globals: TMP_DIR, TRAEFIK_ACME_PATH, CERT_MAX_ROWS, CERT_WARN_DAYS (read).
collect_certificate_health() {
  append "## Zertifikate"
  append ""

  local cert_file="$TMP_DIR/certificates.tsv"
  local cert_sorted="$TMP_DIR/certificates.sorted.tsv"
  local warning_count=0
  local total_count=0
  local days expires domains
  : > "$cert_file"

  if [ ! -f "$TRAEFIK_ACME_PATH" ]; then
    append "- WARNUNG: Traefik ACME-Datei nicht gefunden: $TRAEFIK_ACME_PATH"
    set_summary "cert_warnings" "1"
    record_section_error "certificates" "ACME-Datei $TRAEFIK_ACME_PATH fehlt"
    append ""
    return
  fi

  # Output format: "<days left>\t<expiry date>\t<comma-separated names>".
  if docker run -i --rm \
    -v "$TRAEFIK_ACME_PATH:/acme.json:ro" \
    python:3.13-alpine python - <<'PY' > "$cert_file"
import base64
import json
import ssl
import tempfile
from datetime import datetime, timezone

with open("/acme.json", "r", encoding="utf-8") as handle:
    data = json.load(handle)

now = datetime.now(timezone.utc)
for resolver in data.values():
    # "Certificates" may be present with a JSON null value; .get() would then
    # return None instead of the default, so normalise to an empty list.
    for cert in resolver.get("Certificates") or []:
        domain = cert.get("domain", {}).get("main") or "-"
        sans = cert.get("domain", {}).get("sans") or []
        cert_b64 = cert.get("certificate")
        if not cert_b64:
            continue
        pem = base64.b64decode(cert_b64)
        # The decoder takes a file path, so spill the PEM to a temp file.
        with tempfile.NamedTemporaryFile(delete=False) as tmp:
            tmp.write(pem)
            tmp_path = tmp.name
        decoded = ssl._ssl._test_decode_cert(tmp_path)
        not_after = datetime.strptime(decoded["notAfter"], "%b %d %H:%M:%S %Y %Z").replace(tzinfo=timezone.utc)
        days = (not_after - now).days
        names = ", ".join([domain, *sans])
        print(f"{days}\t{not_after.date().isoformat()}\t{names}")
PY
  then
    if [ ! -s "$cert_file" ]; then
      append "- WARNUNG: Keine Zertifikate in ACME-Datei gefunden."
      warning_count=1
      record_section_error "certificates" "ACME-Datei enthielt keine Zertifikate"
    else
      sort -n "$cert_file" > "$cert_sorted"
      total_count="$(count_lines < "$cert_sorted")"
      append "- Zertifikate gesamt: $total_count"
      append "- Anzeige: die $CERT_MAX_ROWS Zertifikate mit der kuerzesten Restlaufzeit"
      append "- Schwelle Warnung: weniger als $CERT_WARN_DAYS Tage"
      append ""
      append "| Resttage | Ablaufdatum UTC | Domains |"
      append "|---:|---|---|"

      # Rows shown in the table; they also count towards the warnings.
      while IFS=$'\t' read -r days expires domains; do
        append "| $days | $expires | $domains |"
        if [ "${days:-0}" -lt "$CERT_WARN_DAYS" ]; then
          warning_count=$((warning_count + 1))
        fi
      done < <(head -n "$CERT_MAX_ROWS" "$cert_sorted")

      # Remaining rows are not displayed but still checked against the
      # warning threshold.
      while IFS=$'\t' read -r days expires domains; do
        if [ "${days:-0}" -lt "$CERT_WARN_DAYS" ]; then
          warning_count=$((warning_count + 1))
        fi
      done < <(tail -n +"$((CERT_MAX_ROWS + 1))" "$cert_sorted")

      append ""
      if [ "$warning_count" -eq 0 ]; then
        append "Bewertung: Keine Zertifikate im kritischen Erneuerungsfenster unter $CERT_WARN_DAYS Tagen."
      else
        append "Bewertung: $warning_count Zertifikat(e) laufen in weniger als $CERT_WARN_DAYS Tagen ab und sollten beobachtet werden."
      fi
    fi
  else
    append "- WARNUNG: Zertifikate konnten nicht aus ACME-Datei gelesen werden."
    warning_count=1
    record_section_error "certificates" "Auswertung der ACME-Datei fehlgeschlagen"
  fi

  set_summary "cert_warnings" "$warning_count"
  append ""
}
|
|
|
|
# Write the "Storage / Filesystem" section: one table row per monitored path
# with filesystem type, usage percentage, free space and a verdict. The number
# of paths that are missing, unreadable or at/above $DISK_USAGE_WARN_PCT is
# stored in the summary as disk_warnings.
# Globals: DISK_USAGE_WARN_PCT (read); DISK_CHECK_PATHS (read, optional) - a
#          whitespace-separated list of paths that replaces the built-in
#          default list.
#
# A failing `df` or `findmnt` must not abort the report under
# `set -e -o pipefail`; it yields an empty value, which is rendered as
# "unbekannt" and counted as a warning.
collect_disk_health() {
  append "## Storage / Filesystem"
  append ""

  local disk_warnings=0
  local -a paths=()
  read -r -a paths <<< "${DISK_CHECK_PATHS:-/mnt/cache /mnt/disk1 /mnt/user /mnt/user/appdata /mnt/user/backups}"

  append "- Schwelle Warnung: Nutzung ab ${DISK_USAGE_WARN_PCT}%"
  append ""
  append "| Pfad | Filesystem | Nutzung | Frei | Bewertung |"
  append "|---|---|---:|---:|---|"

  local path fstype usage usage_cell avail verdict
  for path in ${paths[@]+"${paths[@]}"}; do
    if [ ! -e "$path" ]; then
      append "| $path | - | - | - | fehlt |"
      disk_warnings=$((disk_warnings + 1))
      record_section_error "disk" "Kernpfad $path fehlt"
      continue
    fi

    fstype="$(findmnt -T "$path" -no FSTYPE 2>/dev/null | head -n 1 || true)"
    # Second line of `df -P`: column 5 is the usage percentage, column 4 of
    # `df -hP` the human-readable free space.
    usage="$(df -P "$path" 2>/dev/null | awk 'NR==2 { gsub("%", "", $5); print $5 }' || true)"
    avail="$(df -hP "$path" 2>/dev/null | awk 'NR==2 { print $4 }' || true)"
    verdict="ok"

    if [[ ! "${usage:-}" =~ ^[0-9]+$ ]]; then
      usage_cell="-"
      verdict="unbekannt"
      disk_warnings=$((disk_warnings + 1))
    else
      usage_cell="${usage}%"
      if [ "$usage" -ge "$DISK_USAGE_WARN_PCT" ]; then
        verdict="Warnung: >=${DISK_USAGE_WARN_PCT}%"
        disk_warnings=$((disk_warnings + 1))
      fi
    fi

    append "| $path | ${fstype:-unbekannt} | $usage_cell | ${avail:-?} | $verdict |"
  done

  append ""
  if [ "$disk_warnings" -eq 0 ]; then
    append "Bewertung: Keine kritischen Fuellstaende oder fehlenden Kernpfade erkannt."
  else
    append "Bewertung: $disk_warnings Storage-/Filesystem-Punkt(e) brauchen Aufmerksamkeit."
  fi

  set_summary "disk_warnings" "$disk_warnings"
  append ""
}
|
|
|
|
# Write the "Image-Aktualitaet" section: for every running container determine
# the age in days of the image it runs, list the ten oldest and count those at
# or above $IMAGE_AGE_WARN_DAYS (stored in the summary as image_warnings).
# Containers whose image id or creation date cannot be determined are skipped.
collect_image_freshness() {
  append "## Image-Aktualitaet"
  append ""

  local image_file="$TMP_DIR/images.tsv"
  local image_warnings=0
  local now_epoch
  local image_id created_iso created_epoch age_days image_tag
  : > "$image_file"
  now_epoch="$(date +%s)"

  while IFS= read -r name; do
    if [ -z "$name" ]; then
      continue
    fi

    image_id="$(docker inspect --format '{{.Image}}' "$name" 2>/dev/null || true)"
    if [ -z "$image_id" ]; then
      continue
    fi

    created_iso="$(docker image inspect --format '{{.Created}}' "$image_id" 2>/dev/null || true)"
    if [ -z "$created_iso" ]; then
      continue
    fi
    image_tag="$(docker inspect --format '{{.Config.Image}}' "$name" 2>/dev/null || echo '?')"

    created_epoch="$(date -d "$created_iso" +%s 2>/dev/null || echo 0)"
    if [ "$created_epoch" -le 0 ]; then
      continue
    fi

    age_days=$(( (now_epoch - created_epoch) / 86400 ))
    # Row format: "<age days>\t<container>\t<image reference>".
    printf '%d\t%s\t%s\n' "$age_days" "$name" "$image_tag" >> "$image_file"
    if [ "$age_days" -ge "$IMAGE_AGE_WARN_DAYS" ]; then
      image_warnings=$((image_warnings + 1))
    fi
  done < <(docker ps --format '{{.Names}}')

  set_summary "image_warnings" "$image_warnings"

  if [ -s "$image_file" ]; then
    append "- Schwelle Warnung: Image aelter als $IMAGE_AGE_WARN_DAYS Tage"
    append "- Container mit Image >= $IMAGE_AGE_WARN_DAYS Tage: $image_warnings"
    append ""
    append "### Aelteste Images (Top 10)"
    append ""
    append "| Alter Tage | Container | Image |"
    append "|---:|---|---|"
    sort -nr "$image_file" | head -n 10 | while IFS=$'\t' read -r row_age row_name row_image; do
      append "| $row_age | $row_name | $row_image |"
    done
    append ""
    if [ "$image_warnings" -eq 0 ]; then
      append "Bewertung: Keine Container mit ueberalterten Images. CVE-Hygiene aus dieser Sicht ok."
    else
      append "Bewertung: $image_warnings Container nutzen Images aelter als $IMAGE_AGE_WARN_DAYS Tage. Update-Pipeline und CVE-Status pruefen."
    fi
  else
    append "- Keine Image-Daten verfuegbar."
    record_section_error "images" "Keine Image-Daten ermittelt"
  fi

  append ""
}
|
|
|
|
# Write the "Docker Events" section: die/oom/kill/restart events within the
# $SINCE window. `die` events with exit code 0 are dropped. The event count is
# stored in the summary as docker_events and at most the last 80 events are
# shown.
collect_container_events() {
  append "## Docker Events ($SINCE)"
  append ""

  local events_file="$TMP_DIR/docker-events.log"
  # Fields per event: time|container|action|exit code|image.
  local -a event_args=(
    --since "$SINCE"
    --until "$(date -Iseconds)"
    --filter event=die
    --filter event=oom
    --filter event=kill
    --filter event=restart
    --format '{{.Time}}|{{.Actor.Attributes.name}}|{{.Action}}|{{.Actor.Attributes.exitCode}}|{{.Actor.Attributes.image}}'
  )

  # Best effort: a timeout or docker failure leaves whatever was collected.
  timeout 20 docker events "${event_args[@]}" \
    | awk -F '|' '!(($3 == "die") && ($4 == "0")) { print }' \
    > "$events_file" 2>/dev/null || true

  local event_count
  event_count="$(count_lines < "$events_file")"
  set_summary "docker_events" "$event_count"

  if [ "$event_count" -ne 0 ]; then
    append "- Relevante Events: $event_count"
    append ""
    append '```text'
    tail -n 80 "$events_file" >> "$BODY_PATH"
    append '```'
  else
    append '- Keine `die`/`oom`/`kill`/`restart` Events im Zeitraum.'
  fi

  append ""
}
|
|
|
|
# Write the "Container-Zustand" section with two lists: containers in state
# exited/dead/created, and containers whose RestartCount is above zero (sorted
# by count, highest first).
collect_container_state() {
  append "## Container-Zustand"
  append ""

  # --- Containers that are not running -------------------------------------
  append "### Nicht laufende Container"
  local stopped_file="$TMP_DIR/stopped.log"
  docker ps -a --filter status=exited --filter status=dead --filter status=created --format '{{.Names}}\t{{.Status}}' > "$stopped_file"
  if [ -s "$stopped_file" ]; then
    append '```text'
    cat "$stopped_file" >> "$BODY_PATH"
    append '```'
  else
    append "- Keine."
  fi
  append ""

  # --- Containers that have been restarted ---------------------------------
  append "### Container mit RestartCount > 0"
  local restart_file="$TMP_DIR/restarts.log"
  local count
  : > "$restart_file"
  while IFS= read -r name; do
    if [ -z "$name" ]; then
      continue
    fi
    # An inspect failure is treated as "no restarts".
    count="$(docker inspect "$name" --format '{{.RestartCount}}' 2>/dev/null || echo 0)"
    if [ "${count:-0}" -gt 0 ]; then
      printf '%s\t%s\n' "$name" "$count" >> "$restart_file"
    fi
  done < <(docker ps -a --format '{{.Names}}')

  if [ -s "$restart_file" ]; then
    append '```text'
    sort -k2,2nr "$restart_file" >> "$BODY_PATH"
    append '```'
  else
    append "- Keine."
  fi
  append ""
}
|
|
|
|
# Write the "Traefik 5xx" section: log lines of the `traefik` container within
# the $SINCE window whose 9th whitespace-separated field is a 5xx code. Shows
# a per service/code grouping (field 12 is used as the service) and the last
# $MAX_LOG_LINES matching lines. The count is stored in the summary as
# traefik_5xx.
collect_traefik_5xx() {
  append "## Traefik 5xx ($SINCE)"
  append ""

  if ! have_container traefik; then
    append "- Traefik-Container nicht gefunden."
    append ""
    record_section_error "traefik" "Container traefik nicht gefunden"
    return
  fi

  local file="$TMP_DIR/traefik-5xx.log"
  # Best effort: a failing `docker logs` simply yields no matches.
  docker logs --since "$SINCE" traefik 2>&1 \
    | awk '$9 ~ /^5[0-9][0-9]$/ { print }' \
    > "$file" || true

  local count
  count="$(count_lines < "$file")"
  set_summary "traefik_5xx" "$count"

  if [ "$count" -ne 0 ]; then
    append "- 5xx-Antworten: $count"
    append ""
    append "### Gruppiert nach Service/Code"
    append '```text'
    awk '{ code=$9; service=$12; gsub(/"/, "", service); counts[service " " code]++ } END { for (k in counts) print counts[k], k }' "$file" | sort -nr >> "$BODY_PATH"
    append '```'
    append ""
    append "### Letzte Zeilen"
    append '```text'
    tail -n "$MAX_LOG_LINES" "$file" >> "$BODY_PATH"
    append '```'
  else
    append "- Keine 5xx-Antworten."
  fi

  append ""
}
|
|
|
|
# Write the "Log-Auswertung" section.
# Steps:
#   1. Collect suspicious log lines (error/fatal/panic/...) from every running
#      container within the $SINCE window, redact token-like values and prefix
#      each line with "[container] ".
#   2. Split the hits into known noise (matching $NOISE_PATTERNS_FILE) and
#      lines that need attention.
#   3. Build per-container and per-pattern noise breakdowns and count the
#      patterns whose hit count exceeds $NOISE_ESCALATION_THRESHOLD.
#   4. Store the counters in the summary and render the report section.
collect_log_highlights() {
  append "## Log-Auswertung ($SINCE)"
  append ""
  append "Ziel dieses Abschnitts ist nicht, Rohlogs zu wiederholen, sondern handlungsrelevante Auffaelligkeiten auszusortieren."
  append ""

  local hits="$TMP_DIR/log-hits.log"
  local attention="$TMP_DIR/log-attention.log"
  local known_noise="$TMP_DIR/log-known-noise.log"
  : > "$hits"
  : > "$attention"
  : > "$known_noise"

  # Step 1: gather candidate lines per container (best effort per container).
  while IFS= read -r name; do
    if [ -z "$name" ]; then
      continue
    fi
    docker logs --since "$SINCE" "$name" 2>&1 \
      | grep -Eai 'error|fatal|panic|exception|failed|denied|unauthorized|forbidden|oom' \
      | grep -Eavi 'level=info|levelname.: .INFO| 200 OK| 404 Not Found|healthcheck|probe_success' \
      | grep -Eavi 'production.DEBUG|stats_refresh_scheduler.*errors.: 0|Sync completed.*Failed: 0' \
      | sed -E 's/(refresh_token: )[A-Za-z0-9._-]+/\1[REDACTED]/Ig; s/(token: )[A-Za-z0-9._-]+/\1[REDACTED]/Ig; s/(Authorization: )[A-Za-z0-9._ -]+/\1[REDACTED]/Ig' \
      | sed "s/^/[$name] /" >> "$hits" || true
  done < <(docker ps --format '{{.Names}}')

  # Step 2a: normalize the noise pattern file (no comments, no empty lines,
  # trimmed whitespace). An empty or whitespace-only pattern line would
  # otherwise make grep -Eaif match every hit and silently wipe the log
  # highlights.
  local noise_normalized="$TMP_DIR/noise.patterns.normalized"
  : > "$noise_normalized"
  if [ ! -f "$NOISE_PATTERNS_FILE" ]; then
    record_section_error "log-highlights" "Noise-Pattern-Datei $NOISE_PATTERNS_FILE fehlt - alle Treffer gelten als handlungsrelevant"
  elif [ -x "$NORMALIZE_NOISE_SCRIPT" ]; then
    "$NORMALIZE_NOISE_SCRIPT" "$NOISE_PATTERNS_FILE" > "$noise_normalized" 2>/dev/null || : > "$noise_normalized"
  else
    record_section_error "log-highlights" "Normalize-Helper fehlt oder nicht ausfuehrbar: $NORMALIZE_NOISE_SCRIPT - Noise-Patterns ungenormt verwendet"
    # Inline fallback so the "empty line matches all" trap is still avoided.
    grep -Ev '^[[:space:]]*(#|$)' "$NOISE_PATTERNS_FILE" 2>/dev/null \
      | sed -E 's/^[[:space:]]+//; s/[[:space:]]+$//' \
      | grep -v '^$' > "$noise_normalized" || : > "$noise_normalized"
  fi

  # Step 2b: split hits into known noise and attention-worthy lines.
  if [ -s "$hits" ]; then
    if [ -s "$noise_normalized" ]; then
      grep -Eaif "$noise_normalized" "$hits" > "$known_noise" || true
    fi
    if [ -s "$known_noise" ]; then
      # Strip trailing whitespace on both sides before the exact-line diff.
      sed -E 's/[[:space:]]+$//' "$known_noise" > "$known_noise.norm"
      sed -E 's/[[:space:]]+$//' "$hits" > "$hits.norm"
      grep -Fvxf "$known_noise.norm" "$hits.norm" > "$attention" || true
    else
      cp "$hits" "$attention"
    fi
  fi

  # Step 3a: per-container noise breakdown (always computed, even if
  # SHOW_KNOWN_NOISE=0).
  local noise_by_container="$TMP_DIR/noise-by-container.tsv"
  : > "$noise_by_container"
  if [ -s "$known_noise" ]; then
    awk -F '[][]' '{ counts[$2]++ } END { for (n in counts) print counts[n] "\t" n }' "$known_noise" \
      | sort -nr > "$noise_by_container"
  fi

  # Step 3b: per-pattern noise breakdown - how often each pattern hit in $hits.
  # A single hit line may match several patterns, so counts can overlap.
  local noise_by_pattern="$TMP_DIR/noise-by-pattern.tsv"
  local pcount
  : > "$noise_by_pattern"
  if [ -s "$noise_normalized" ] && [ -s "$hits" ]; then
    while IFS= read -r p; do
      if [ -z "$p" ]; then
        continue
      fi
      pcount="$(grep -Eaic -- "$p" "$hits" 2>/dev/null || true)"
      if [ "${pcount:-0}" -gt 0 ]; then
        printf '%d\t%s\n' "$pcount" "$p" >> "$noise_by_pattern"
      fi
    done < "$noise_normalized"
    if [ -s "$noise_by_pattern" ]; then
      sort -nr -o "$noise_by_pattern" "$noise_by_pattern"
    fi
  fi

  # Step 3c: number of patterns with more hits than the escalation threshold.
  local noise_threshold_exceeded=0
  if [ -s "$noise_by_pattern" ]; then
    noise_threshold_exceeded="$(awk -v t="$NOISE_ESCALATION_THRESHOLD" '$1 > t { n++ } END { print n + 0 }' "$noise_by_pattern")"
  fi
  set_summary "noise_threshold_exceeded" "$noise_threshold_exceeded"

  # Step 4: counters for summary and report.
  local hit_count attention_count known_noise_count
  hit_count="$(count_lines < "$hits")"
  attention_count="$(count_lines < "$attention")"
  known_noise_count="$(count_lines < "$known_noise")"
  set_summary "log_highlights" "$attention_count"
  set_summary "log_hits_total" "$hit_count"
  set_summary "log_known_noise" "$known_noise_count"

  if [ "$hit_count" -eq 0 ]; then
    append "- Keine auffaelligen Logmuster gefunden."
  else
    append "- Gefundene Logmuster insgesamt: $hit_count"
    append "- Davon als bekanntes Rauschen eingeordnet: $known_noise_count"
    append "- Handlungsrelevante Logmuster: $attention_count"
    append "- Noise-Pattern-Quelle: \`$NOISE_PATTERNS_FILE\`"
    append "- Eskalations-Schwelle pro Pattern: $NOISE_ESCALATION_THRESHOLD"
    if [ "$noise_threshold_exceeded" -gt 0 ]; then
      append "- WARNUNG: $noise_threshold_exceeded Pattern ueberschreit(en) die Schwelle - bitte pruefen ob noch wirklich Noise."
    fi
    append ""

    if [ "$attention_count" -eq 0 ]; then
      append "Bewertung: Keine handlungsrelevanten Logmuster. Die Treffer bestehen aus bekannten, aktuell nicht kritischen Meldungen."
    else
      append "Bewertung: Es gibt Logmuster, die nicht automatisch als bekanntes Rauschen eingeordnet wurden. Diese sollten geprueft werden."
      append ""
      append "### Betroffene Container"
      append ""
      append "| Container | Anzahl |"
      append "|---|---:|"
      awk -F '[][]' '{ counts[$2]++ } END { for (name in counts) print "| " name " | " counts[name] " |" }' "$attention" | sort >> "$BODY_PATH"
      append ""
      append "### Beispiele"
      append ""
      append '```text'
      # Up to three sample lines per container, whitespace-collapsed and
      # capped at 220 characters.
      awk -F '[][]' '
        {
          name=$2
          if (seen[name] < 3) {
            line=$0
            gsub(/[[:space:]]+/, " ", line)
            if (length(line) > 220) line=substr(line, 1, 217) "..."
            print line
            seen[name]++
          }
        }
      ' "$attention" | head -n "$MAX_LOG_LINES" >> "$BODY_PATH"
      append '```'
    fi

    if [ "$known_noise_count" -gt 0 ]; then
      append ""
      append "### Bekanntes Rauschen (Top)"
      append ""
      if [ -s "$noise_by_container" ]; then
        append "#### Container mit den meisten Noise-Treffern"
        append ""
        append "| Container | Anzahl |"
        append "|---|---:|"
        head -n "$NOISE_BREAKDOWN_TOP_N" "$noise_by_container" \
          | while IFS=$'\t' read -r row_count row_container; do
              append "| ${row_container:-?} | $row_count |"
            done
        append ""
      fi
      if [ -s "$noise_by_pattern" ]; then
        append "#### Pattern mit den meisten Treffern"
        append ""
        append "| Pattern | Anzahl |"
        append "|---|---:|"
        head -n "$NOISE_BREAKDOWN_TOP_N" "$noise_by_pattern" \
          | while IFS=$'\t' read -r row_count row_pattern; do
              local short="$row_pattern"
              if [ "${#short}" -gt 80 ]; then
                short="${short:0:77}..."
              fi
              # Escape pipe characters that would break the markdown table.
              short="${short//|/\\|}"
              append "| \`$short\` | $row_count |"
            done
        append ""
      fi
      if [ "$noise_threshold_exceeded" -gt 0 ]; then
        append "Bewertung: $noise_threshold_exceeded Pattern ueberschreit(en) die Eskalations-Schwelle ($NOISE_ESCALATION_THRESHOLD). Bitte pruefen, ob die als Noise eingeordneten Meldungen noch fachlich Noise sind oder ob sich ein echter Vorfall darunter versteckt."
      else
        append "Bewertung: Kein Pattern ueberschreitet die Eskalations-Schwelle ($NOISE_ESCALATION_THRESHOLD)."
      fi
    fi

    if [ "$known_noise_count" -gt 0 ] && [ "$SHOW_KNOWN_NOISE" = "1" ]; then
      append ""
      append "### Ausgeblendetes bekanntes Rauschen (Top 50 Zeilen)"
      append ""
      append '```text'
      head -n 50 "$known_noise" >> "$BODY_PATH"
      append '```'
    fi
  fi

  append ""
}
|
|
|
|
#######################################
# Append the "Log-Volumen" section: counts the log lines each running
# container produced within the reporting window and lists the loudest ones.
# Globals:
#   SINCE             (read) window passed to `docker logs --since`
#   TMP_DIR           (read) directory for the intermediate TSV file
#   LOG_VOLUME_TOP_N  (read) number of table rows
# Uses helpers defined elsewhere in this file:
#   append, set_summary, count_lines, record_section_error
# Outputs:
#   Report lines via append; summary key "log_volume_total" via set_summary.
#######################################
collect_log_volume() {
  append "## Log-Volumen ($SINCE)"
  append ""

  local volume_file="$TMP_DIR/log-volume.tsv"
  : > "$volume_file"

  local name count skipped=0
  while IFS= read -r name; do
    [ -n "$name" ] || continue
    # A container can vanish between `docker ps` and `docker logs`. The script
    # runs with `set -euo pipefail`, so an unguarded failing pipeline in this
    # assignment would abort the whole report. Skip the container instead.
    if ! count="$(docker logs --since "$SINCE" "$name" 2>&1 | count_lines)"; then
      skipped=$((skipped + 1))
      continue
    fi
    printf '%d\t%s\n' "$count" "$name" >> "$volume_file"
  done < <(docker ps --format '{{.Names}}')

  local total
  total="$(awk '{ s += $1 } END { print s + 0 }' "$volume_file")"
  set_summary "log_volume_total" "$total"

  if [ "$skipped" -gt 0 ]; then
    append "- Container ohne lesbare Logs uebersprungen: $skipped"
  fi

  if [ "$total" -eq 0 ]; then
    append "- Keine Logzeilen im Zeitraum (unwahrscheinlich, evtl. Datenquelle pruefen)."
    record_section_error "log-volume" "Log-Volumen ueber alle Container ist 0"
  else
    append "- Zeilen insgesamt im Zeitraum: $total"
    append ""
    append "### Top $LOG_VOLUME_TOP_N lauteste Container"
    append ""
    append "| Container | Zeilen |"
    append "|---|---:|"

    # Feed the loop via process substitution (same pattern as the docker ps
    # loop above) so it runs in the current shell rather than a pipeline stage.
    local line_count container
    while IFS=$'\t' read -r line_count container; do
      append "| $container | $line_count |"
    done < <(sort -nr "$volume_file" | head -n "$LOG_VOLUME_TOP_N")

    append ""
    append "Bewertung: Auffaellig laute Container sind oft ein Frueh-Indikator fuer Endlosschleifen, schlecht konfigurierte Loglevel oder Probe-Spam."
  fi
  append ""
}
|
|
|
|
#######################################
# Append the "Vergleich mit gestern" section: a table of today's summary
# values next to the values persisted for the previous day, plus a one-line
# assessment of whether selected metrics changed.
# Globals:
#   REPORT_DATE   (read) date of the report being generated
#   REPORT_DIR    (read) directory holding summary-<date>.env files
#   SUMMARY_PATH  (read) today's summary file; it is sourced, so its keys
#                 become shell variables in the current shell
# Uses helper defined elsewhere in this file: append
# Outputs:
#   Report lines via append.
# Returns:
#   0; returns early with a note when no previous-day summary exists.
#######################################
collect_diff_yesterday() {
  append "## Vergleich mit gestern"
  append ""

  local yesterday yesterday_summary
  # Anchor "yesterday" on REPORT_DATE (which can be overridden and is what the
  # comparison line prints), not on the wall clock. Noon keeps the calculation
  # away from DST switches around midnight. Falls back to the system's
  # yesterday if REPORT_DATE is not parseable by `date -d`.
  yesterday="$(
    date -d "$REPORT_DATE 12:00 yesterday" +%F 2>/dev/null \
      || date -d 'yesterday' +%F 2>/dev/null \
      || true
  )"
  yesterday_summary="$REPORT_DIR/summary-$yesterday.env"

  if [ -z "$yesterday" ] || [ ! -f "$yesterday_summary" ]; then
    append "- Keine Vortagsdaten verfuegbar ($yesterday_summary)."
    append ""
    return
  fi

  local prev_borg= prev_alerts= prev_firing= prev_pending= prev_unhealthy=
  local prev_exited= prev_5xx= prev_events= prev_log= prev_certs=
  local prev_disk= prev_img= prev_drift= prev_vol=
  local key='' value=''

  # Yesterday's file is parsed as plain key=value text (not sourced).
  # `|| [ -n "$key" ]` keeps a final line that lacks a trailing newline.
  while IFS='=' read -r key value || [ -n "$key" ]; do
    case "$key" in
      borg_status) prev_borg="$value" ;;
      prometheus_alerts) prev_alerts="$value" ;;
      prometheus_alerts_firing) prev_firing="$value" ;;
      prometheus_alerts_pending) prev_pending="$value" ;;
      containers_unhealthy) prev_unhealthy="$value" ;;
      containers_exited_nonzero) prev_exited="$value" ;;
      traefik_5xx) prev_5xx="$value" ;;
      docker_events) prev_events="$value" ;;
      log_highlights) prev_log="$value" ;;
      cert_warnings) prev_certs="$value" ;;
      disk_warnings) prev_disk="$value" ;;
      image_warnings) prev_img="$value" ;;
      backup_duration_drift) prev_drift="$value" ;;
      log_volume_total) prev_vol="$value" ;;
    esac
  done < "$yesterday_summary"

  # Today's values come from sourcing the working summary file.
  # shellcheck disable=SC1090
  . "$SUMMARY_PATH"

  append "Vergleich des Datums $REPORT_DATE mit $yesterday."
  append ""
  append "| Metrik | Heute | Gestern |"
  append "|---|---:|---:|"
  append "| Borg Status | ${borg_status:-?} | ${prev_borg:-?} |"
  append "| Prometheus Alerts gesamt | ${prometheus_alerts:-?} | ${prev_alerts:-?} |"
  append "| Prometheus firing | ${prometheus_alerts_firing:-?} | ${prev_firing:-?} |"
  append "| Prometheus pending | ${prometheus_alerts_pending:-?} | ${prev_pending:-?} |"
  append "| Container unhealthy | ${containers_unhealthy:-?} | ${prev_unhealthy:-?} |"
  append "| Container exited non-zero | ${containers_exited_nonzero:-?} | ${prev_exited:-?} |"
  append "| Docker Events | ${docker_events:-?} | ${prev_events:-?} |"
  append "| Traefik 5xx | ${traefik_5xx:-?} | ${prev_5xx:-?} |"
  append "| Log-Highlights | ${log_highlights:-?} | ${prev_log:-?} |"
  append "| Log-Volumen | ${log_volume_total:-?} | ${prev_vol:-?} |"
  append "| Zertifikatswarnungen | ${cert_warnings:-?} | ${prev_certs:-?} |"
  append "| Storage-Warnungen | ${disk_warnings:-?} | ${prev_disk:-?} |"
  append "| Image-Warnungen | ${image_warnings:-?} | ${prev_img:-?} |"
  append "| Backup-Dauer-Drift | ${backup_duration_drift:-?} | ${prev_drift:-?} |"
  append ""

  # Only this subset of metrics decides whether the day counts as "changed";
  # missing values on either side are treated as 0 / unknown.
  local notable=0
  if [ "${containers_exited_nonzero:-0}" != "${prev_exited:-0}" ] \
    || [ "${containers_unhealthy:-0}" != "${prev_unhealthy:-0}" ] \
    || [ "${prometheus_alerts_firing:-0}" != "${prev_firing:-0}" ] \
    || [ "${prometheus_alerts_pending:-0}" != "${prev_pending:-0}" ] \
    || [ "${log_highlights:-0}" != "${prev_log:-0}" ] \
    || [ "${borg_status:-unknown}" != "${prev_borg:-unknown}" ] \
    || [ "${backup_duration_drift:-0}" != "${prev_drift:-0}" ]; then
    notable=1
  fi

  if [ "$notable" -eq 0 ]; then
    append "Bewertung: Keine relevanten Aenderungen gegenueber gestern."
  else
    append "Bewertung: Relevante Aenderungen gegenueber gestern. Details bitte in den einzelnen Abschnitten pruefen."
  fi
  append ""
}
|
|
|
|
#######################################
# Append the "Self-Health" section describing the report run itself:
# elapsed runtime, number of recorded section errors, presence of the noise
# pattern file and the lock file path, followed by the recorded errors.
# Globals:
#   SCRIPT_START         (read) epoch seconds captured at script start
#   SECTION_ERRORS_FILE  (read) file with one recorded section error per line
#   NOISE_PATTERNS_FILE  (read) only tested for existence
#   LOCK_FILE            (read) only printed
# Uses helpers defined elsewhere in this file:
#   append, set_summary, count_lines, format_duration
# Outputs:
#   Report lines via append; summary keys "script_duration_seconds" and
#   "section_failures" via set_summary.
#######################################
collect_self_health() {
  append "## Self-Health"
  append ""

  local script_duration section_failures=0 patterns_present line
  script_duration=$(( $(date +%s) - SCRIPT_START ))

  # A missing error file would make the redirect fail and, with `set -e`
  # active, abort the run before the report is written. Treat it as
  # "no errors recorded".
  if [ -r "$SECTION_ERRORS_FILE" ]; then
    section_failures="$(count_lines < "$SECTION_ERRORS_FILE")"
  fi

  set_summary "script_duration_seconds" "$script_duration"
  set_summary "section_failures" "$section_failures"

  if [ -f "$NOISE_PATTERNS_FILE" ]; then
    patterns_present="ja"
  else
    patterns_present="nein"
  fi

  append "- Skript-Laufzeit: $(format_duration "$script_duration") (${script_duration}s)"
  append "- Sektionen mit Fehlern: $section_failures"
  append "- Noise-Pattern-Datei vorhanden: $patterns_present"
  append "- Lock-Datei: \`$LOCK_FILE\`"

  if [ "$section_failures" -gt 0 ]; then
    append ""
    append "### Fehlerhafte Sektionen"
    append ""
    # `line` is declared local above so it does not leak into global scope.
    while IFS= read -r line; do
      append "- $line"
    done < "$SECTION_ERRORS_FILE"
  fi
  append ""
}
|
|
|
|
#######################################
# Assemble the final Markdown report (header, executive summary, management
# bullet list, collected body, closing assessment) and persist today's
# summary file next to it. Both files are written to "<target>.tmp" first and
# then moved into place.
# Globals:
#   REPORT_DIR, REPORT_PATH, PERSISTENT_SUMMARY_PATH (read) output locations
#   SUMMARY_PATH  (read) working summary; sourced, so its keys become shell
#                 variables in the current shell
#   BODY_PATH     (read) collected section text, copied verbatim
#   REPORT_DATE, SINCE, REPORT_STATUS (read)
# Outputs:
#   Writes REPORT_PATH and PERSISTENT_SUMMARY_PATH.
#######################################
write_report() {
  # REPORT_PATH and PERSISTENT_SUMMARY_PATH can be overridden independently of
  # REPORT_DIR, so create the parent directory of every real target.
  mkdir -p -- "$REPORT_DIR" \
    "$(dirname -- "$REPORT_PATH")" \
    "$(dirname -- "$PERSISTENT_SUMMARY_PATH")"

  # shellcheck disable=SC1090
  . "$SUMMARY_PATH"

  # Status-dependent paragraphs; any status other than OK/WARNUNG gets the
  # critical wording.
  local intro closing
  case "$REPORT_STATUS" in
    OK)
      intro='Im betrachteten Zeitraum zeigt das Homelab eine stabile Betriebslage. Das letzte Borg-Backup ist erfolgreich abgeschlossen, Prometheus meldet keine firing Alerts, keine unhealthy Container, Zertifikate und Storage im erwarteten Bereich.'
      closing='Das Homelab war im betrachteten Zeitraum betriebsfaehig und ohne akute Warnsignale. Es besteht aus diesem Report heraus kein unmittelbarer Handlungsdruck.'
      ;;
    WARNUNG)
      intro='Im betrachteten Zeitraum gibt es Punkte, die Aufmerksamkeit verdienen. Der Betrieb ist nicht automatisch als kompromittiert zu bewerten, aber mindestens ein Signal (Backup, Pending Alert, Zertifikat, Storage, Image-Alter, Drift oder Reboot) weicht vom Normalzustand ab.'
      closing='Das Homelab war grundsaetzlich betriebsfaehig, zeigt aber mindestens eine Auffaelligkeit. Die im Bericht genannten Punkte sollten geprueft und bei Wiederholung nachverfolgt werden.'
      ;;
    *)
      intro='Im betrachteten Zeitraum liegt ein kritisches Betriebssignal vor. Der Bericht sollte zeitnah gelesen und die betroffenen Komponenten priorisiert geprueft werden.'
      closing='Das Homelab zeigt ein kritisches Signal. Die betroffenen Dienste, Backup-Lage und firing Alerts sollten sofort geprueft werden.'
      ;;
  esac

  {
    printf '# Homelab Operations Report - %s\n\n' "$REPORT_DATE"
    printf '%s\n' "- Erstellt: \`$(date -Iseconds)\`"
    printf '%s\n' "- Zeitraum: letzte \`$SINCE\`"
    printf '%s\n' "- Host: \`$(hostname)\`"
    printf '%s\n\n' "- Gesamtbewertung: \`$REPORT_STATUS\`"

    printf '## Executive Summary\n\n'
    printf '%s\n\n' "$intro"

    printf '### Management-Bewertung\n\n'
    printf '%s\n' \
      "- Status: \`$REPORT_STATUS\`" \
      "- Borg Backup: \`${borg_status:-unknown}\`" \
      "- Backup-Dauer-Drift: \`${backup_duration_drift:-unknown}\`" \
      "- Prometheus Alerts (gesamt/firing/pending): \`${prometheus_alerts:-unknown}\` / \`${prometheus_alerts_firing:-unknown}\` / \`${prometheus_alerts_pending:-unknown}\`" \
      "- Container unhealthy: \`${containers_unhealthy:-unknown}\`" \
      "- Container exited non-zero: \`${containers_exited_nonzero:-unknown}\`" \
      "- Docker Critical Events: \`${docker_events:-unknown}\`" \
      "- Traefik 5xx: \`${traefik_5xx:-unknown}\`" \
      "- Zertifikatswarnungen: \`${cert_warnings:-unknown}\`" \
      "- Storage-Warnungen: \`${disk_warnings:-unknown}\`" \
      "- Image-Warnungen: \`${image_warnings:-unknown}\`" \
      "- Log-Highlights: \`${log_highlights:-unknown}\`" \
      "- Noise-Pattern ueber Schwelle: \`${noise_threshold_exceeded:-0}\`" \
      "- Log-Volumen gesamt: \`${log_volume_total:-unknown}\`" \
      "- Reboot in letzten 24h: \`${host_recent_boot:-unknown}\`"
    printf '%s\n\n' "- Sektionsfehler im Skript: \`${section_failures:-unknown}\`"

    printf '### Einordnung\n\n'
    printf '%s\n\n' 'Dieser Report ist ein Management-Lagebericht: Er verdichtet Backup-Status, Container-Zustand, Monitoring-Alerts, Traefik-Fehler, Zertifikate, Storage, Image-Aktualitaet, Log-Volumen und Drift-Indikatoren. Rohlogs werden nur ausschnittsweise gezeigt, damit der Bericht lesbar bleibt und trotzdem nachvollziehbar ist.'

    cat "$BODY_PATH"

    printf '## Schlussbewertung\n\n'
    printf '%s\n' "$closing"
  } > "$REPORT_PATH.tmp"
  mv -- "$REPORT_PATH.tmp" "$REPORT_PATH"

  cp -- "$SUMMARY_PATH" "$PERSISTENT_SUMMARY_PATH.tmp"
  mv -- "$PERSISTENT_SUMMARY_PATH.tmp" "$PERSISTENT_SUMMARY_PATH"
}
|
|
|
|
#######################################
# Hand the finished report to the mail script when mailing is enabled and the
# configured mode matches the report status.
# Globals:
#   SEND_MAIL      (read) "1" enables mailing; anything else is a no-op
#   MAIL_SCRIPT    (read) executable invoked as: MAIL_SCRIPT REPORT_PATH STATUS
#   MAIL_MODE      (read) always | warning | critical
#   REPORT_STATUS  (read) compared against WARNUNG / KRITISCH
#   REPORT_PATH    (read)
# Uses helper defined elsewhere in this file: record_section_error
# Returns:
#   0 when mailing is disabled or the mode does not apply to this status;
#   the mail script's exit status when it is invoked;
#   1 when the mail script is missing/not executable or MAIL_MODE is unknown.
#######################################
send_report_mail() {
  [ "$SEND_MAIL" = "1" ] || return 0

  if [ ! -x "$MAIL_SCRIPT" ]; then
    echo "Mail script missing or not executable: $MAIL_SCRIPT" >&2
    record_section_error "mail" "Mail-Skript $MAIL_SCRIPT fehlt oder nicht ausfuehrbar"
    return 1
  fi

  case "$MAIL_MODE:$REPORT_STATUS" in
    always:* | warning:WARNUNG | warning:KRITISCH | critical:KRITISCH)
      "$MAIL_SCRIPT" "$REPORT_PATH" "$REPORT_STATUS"
      ;;
    warning:* | critical:*)
      # Known mode, but this status does not trigger a mail. `always:*` is
      # fully handled by the arm above, so it is not repeated here.
      ;;
    *)
      echo "Unknown MAIL_MODE '$MAIL_MODE' - mail not sent. Use always|warning|critical." >&2
      record_section_error "mail" "Unbekanntes MAIL_MODE '$MAIL_MODE'"
      return 1
      ;;
  esac
}
|
|
|
|
#######################################
# Push a compact status summary through the ntfy helper script. Delivery is
# best effort: problems are reported on stderr but never fail the run.
# Globals:
#   SEND_NTFY     (read) "1" enables the notification
#   NTFY_SCRIPT   (read) executable invoked as:
#                 NTFY_SCRIPT NTFY_TOPIC TITLE BODY PRIORITY
#   NTFY_TOPIC    (read)
#   SUMMARY_PATH  (read) sourced; its keys become shell variables in the
#                 current shell
#   REPORT_PATH, REPORT_STATUS (read)
# Returns:
#   Always 0.
#######################################
send_summary_ntfy() {
  [ "$SEND_NTFY" = "1" ] || return 0

  if [ ! -x "$NTFY_SCRIPT" ]; then
    # Notification was requested but cannot be delivered; say so instead of
    # skipping silently. Still not treated as a failure of the report run.
    echo "ntfy script missing or not executable: $NTFY_SCRIPT - notification skipped" >&2
    return 0
  fi

  # shellcheck disable=SC1090
  . "$SUMMARY_PATH"

  # Map the report status to the priority argument passed to the script.
  local priority
  case "$REPORT_STATUS" in
    KRITISCH) priority="urgent" ;;
    WARNUNG) priority="high" ;;
    *) priority="default" ;;
  esac

  local title="Homelab Tagesprotokoll: ${REPORT_STATUS:-unknown} / borg=${borg_status:-unknown}"

  # One entry per line, no trailing newline after the last entry.
  local body
  printf -v body '%s\n' \
    "Report: $REPORT_PATH" \
    "Status: $REPORT_STATUS" \
    "Container: ${containers_running:-?}/${containers_total:-?} running, unhealthy=${containers_unhealthy:-?}, exited_nonzero=${containers_exited_nonzero:-?}" \
    "Borg: ${borg_status:-unknown} (drift=${backup_duration_drift:-unknown})" \
    "Prometheus alerts (total/firing/pending): ${prometheus_alerts:-unknown}/${prometheus_alerts_firing:-unknown}/${prometheus_alerts_pending:-unknown}" \
    "Docker events: ${docker_events:-unknown}" \
    "Traefik 5xx: ${traefik_5xx:-unknown}" \
    "Certs warn: ${cert_warnings:-unknown}" \
    "Disk warn: ${disk_warnings:-unknown}" \
    "Image warn: ${image_warnings:-unknown}" \
    "Log highlights: ${log_highlights:-unknown}" \
    "Log volume: ${log_volume_total:-unknown}" \
    "Recent boot: ${host_recent_boot:-unknown}" \
    "Section errors: ${section_failures:-unknown}"
  body=${body%$'\n'}

  if ! "$NTFY_SCRIPT" "$NTFY_TOPIC" "$title" "$body" "$priority"; then
    echo "ntfy delivery failed for topic '$NTFY_TOPIC' (ignored)" >&2
  fi
  return 0
}
|
|
|
|
#######################################
# Entry point: run every collector in order, derive the overall status,
# write the report, deliver it (mail, ntfy) and print the report path.
# Globals:
#   REPORT_PATH (read) printed on stdout as the last step
# Outputs:
#   The report path on stdout.
# Returns:
#   0 on success; otherwise the non-zero status of send_report_mail, so the
#   script still terminates with the same status as before when mailing fails.
#######################################
main() {
  collect_overview
  collect_host_health
  collect_borg
  collect_prometheus
  collect_certificate_health
  collect_disk_health
  collect_image_freshness
  collect_container_events
  collect_container_state
  collect_traefik_5xx
  collect_log_highlights
  collect_log_volume
  collect_diff_yesterday
  derive_report_status
  collect_self_health
  write_report

  # The script runs with `set -e`: an unguarded non-zero return from
  # send_report_mail would end the run right here, so the ntfy notification
  # would never be sent and the report path never printed. Capture the status,
  # finish the remaining steps, then hand the status back to the caller.
  local mail_rc=0
  send_report_mail || mail_rc=$?
  send_summary_ntfy

  printf '%s\n' "$REPORT_PATH"
  return "$mail_rc"
}
|
|
|
|
# Run the report, then translate the final REPORT_STATUS into the process
# exit code: KRITISCH -> 2, WARNUNG -> 1, every other status -> 0.
main "$@"

if [ "$REPORT_STATUS" = "KRITISCH" ]; then
  exit 2
elif [ "$REPORT_STATUS" = "WARNUNG" ]; then
  exit 1
fi
exit 0
|