Fix operations report warnings
This commit is contained in:
@@ -13,6 +13,7 @@ CERT_MAX_ROWS="${CERT_MAX_ROWS:-12}"
|
||||
IMAGE_AGE_WARN_DAYS="${IMAGE_AGE_WARN_DAYS:-180}"
|
||||
IMAGE_AGE_ALLOW_FILE="${IMAGE_AGE_ALLOW_FILE:-/mnt/user/services/homelab-infra/services/posture-check/image-age-allow.patterns}"
|
||||
LOG_VOLUME_TOP_N="${LOG_VOLUME_TOP_N:-10}"
|
||||
LOG_VOLUME_OBSERVE_THRESHOLD="${LOG_VOLUME_OBSERVE_THRESHOLD:-100000}"
|
||||
DISK_USAGE_WARN_PCT="${DISK_USAGE_WARN_PCT:-85}"
|
||||
CERT_WARN_DAYS="${CERT_WARN_DAYS:-21}"
|
||||
BACKUP_DRIFT_FACTOR="${BACKUP_DRIFT_FACTOR:-2.0}"
|
||||
@@ -217,6 +218,73 @@ derive_report_status() {
|
||||
set_summary "report_status" "$REPORT_STATUS"
|
||||
}
|
||||
|
||||
# Print the Markdown bullet list that explains a non-OK report status.
#
# Each summary variable is read from the calling scope and treated as its
# neutral default ("0", or "unknown" for borg_status) when unset or empty.
# Globals read: borg_status, prometheus_alerts, cert_warnings, disk_warnings,
#               image_warnings, image_warning_names, containers_exited_nonzero,
#               host_recent_boot, backup_duration_drift,
#               noise_threshold_exceeded, prometheus_alerts_pending,
#               prometheus_alerts_firing, containers_unhealthy
# Arguments:    none
# Outputs:      one "- ..." line per triggered signal on stdout; a single
#               fallback bullet when no signal triggered.
print_status_reasons() {
  # Bullets are collected here and written in one go at the end.
  local bullets=""
  local nl='
'

  if [ "${borg_status:-unknown}" != "completed" ]; then
    bullets="${bullets}- Borg Backup ist \`${borg_status:-unknown}\` statt \`completed\`.${nl}"
  fi

  if [ "${prometheus_alerts:-0}" = "unknown" ]; then
    bullets="${bullets}- Prometheus Alerts konnten nicht sicher gelesen werden.${nl}"
  fi

  if [ "${cert_warnings:-0}" != "0" ]; then
    bullets="${bullets}- Zertifikatswarnungen: \`${cert_warnings:-0}\`.${nl}"
  fi

  if [ "${disk_warnings:-0}" != "0" ]; then
    bullets="${bullets}- Storage-Warnungen: \`${disk_warnings:-0}\`.${nl}"
  fi

  # The list of offending images is appended only when it is available.
  if [ "${image_warnings:-0}" != "0" ]; then
    if [ -z "${image_warning_names:-}" ]; then
      bullets="${bullets}- Image-Warnungen: \`${image_warnings:-0}\`.${nl}"
    else
      bullets="${bullets}- Image-Warnungen: \`${image_warnings:-0}\` (${image_warning_names}).${nl}"
    fi
  fi

  if [ "${containers_exited_nonzero:-0}" != "0" ]; then
    bullets="${bullets}- Container exited non-zero: \`${containers_exited_nonzero:-0}\`.${nl}"
  fi

  if [ "${host_recent_boot:-0}" = "1" ]; then
    bullets="${bullets}- Host-Reboot innerhalb der letzten 24 Stunden.${nl}"
  fi

  if [ "${backup_duration_drift:-0}" = "1" ]; then
    bullets="${bullets}- Backup-Dauer-Drift erkannt.${nl}"
  fi

  if [ "${noise_threshold_exceeded:-0}" != "0" ]; then
    bullets="${bullets}- Noise-Pattern ueber Eskalations-Schwelle: \`${noise_threshold_exceeded:-0}\`.${nl}"
  fi

  # Pending/firing counters are reported only for a known, non-zero value.
  case "${prometheus_alerts_pending:-0}" in
    0 | unknown) ;;
    *)
      bullets="${bullets}- Prometheus pending Alerts: \`${prometheus_alerts_pending:-0}\`.${nl}"
      ;;
  esac

  case "${prometheus_alerts_firing:-0}" in
    0 | unknown) ;;
    *)
      bullets="${bullets}- Prometheus firing Alerts: \`${prometheus_alerts_firing:-0}\`.${nl}"
      ;;
  esac

  if [ "${containers_unhealthy:-0}" != "0" ]; then
    bullets="${bullets}- Unhealthy Container: \`${containers_unhealthy:-0}\`.${nl}"
  fi

  # Nothing matched: emit the fallback bullet instead of an empty section.
  if [ -z "$bullets" ]; then
    bullets="- Keine direkten Ampel-Ausloeser im Summary-Set gefunden.${nl}"
  fi

  printf '%s' "$bullets"
}
|
||||
|
||||
# Print the Markdown bullet list of notable observations for the summary.
#
# Globals read: traefik_5xx, traefik_5xx_top, log_highlights,
#               log_volume_total, docker_events,
#               LOG_VOLUME_OBSERVE_THRESHOLD (minimum line count at which the
#               log volume is reported; falls back to 100000 when unset,
#               empty or not a plain non-negative integer)
# Arguments:    none
# Outputs:      one "- ..." line per observation on stdout; a single fallback
#               bullet when nothing is notable.
print_notable_observations() {
  local count=0
  local volume="${log_volume_total:-0}"
  local threshold="${LOG_VOLUME_OBSERVE_THRESHOLD:-100000}"

  # Emits one bullet and records that something was printed.
  add_observation() {
    printf '%s\n' "- $1"
    count=$((count + 1))
  }

  # A non-numeric override would make `[ ... -ge ... ]` fail with
  # "integer expression expected" and silently drop the observation, so fall
  # back to the same default the script's configuration section uses.
  case "$threshold" in
    *[!0-9]*) threshold=100000 ;;
  esac

  if [ "${traefik_5xx:-0}" != "0" ] && [ "${traefik_5xx:-0}" != "unknown" ]; then
    # "none" is the placeholder for "no top group known".
    if [ -n "${traefik_5xx_top:-}" ] && [ "${traefik_5xx_top:-none}" != "none" ]; then
      add_observation "Traefik 5xx: \`${traefik_5xx:-0}\` (Top-Gruppe: \`${traefik_5xx_top}\`)."
    else
      add_observation "Traefik 5xx: \`${traefik_5xx:-0}\`."
    fi
  fi

  if [ "${log_highlights:-0}" != "0" ] && [ "${log_highlights:-0}" != "unknown" ]; then
    add_observation "Log-Highlights: \`${log_highlights:-0}\` handlungsrelevante Treffer; Beispiele stehen in der Log-Auswertung."
  fi

  # Compare only when the total is a plain integer (e.g. not "unknown"); the
  # pattern checks the whole string without spawning an external process.
  case "$volume" in
    *[!0-9]*) ;;
    *)
      if [ "$volume" -ge "$threshold" ]; then
        add_observation "Log-Volumen: \`${volume}\` Zeilen im Zeitraum; Top-Verursacher stehen im Log-Volumen-Abschnitt."
      fi
      ;;
  esac

  if [ "${docker_events:-0}" != "0" ] && [ "${docker_events:-0}" != "unknown" ]; then
    add_observation "Docker Critical Events: \`${docker_events:-0}\`."
  fi

  if [ "$count" -eq 0 ]; then
    printf '%s\n' "- Keine zusaetzlichen auffaelligen Beobachtungen im Management-Summary."
  fi
}
|
||||
|
||||
collect_borg() {
|
||||
append "## Borg Backup"
|
||||
append ""
|
||||
@@ -584,6 +652,7 @@ collect_image_freshness() {
|
||||
local image_file="$TMP_DIR/images.tsv"
|
||||
local image_warnings=0
|
||||
local image_allowed=0
|
||||
local image_warning_names=""
|
||||
local now_epoch
|
||||
: > "$image_file"
|
||||
now_epoch="$(date +%s)"
|
||||
@@ -630,6 +699,7 @@ collect_image_freshness() {
|
||||
else
|
||||
note="ueberaltert"
|
||||
image_warnings=$((image_warnings + 1))
|
||||
image_warning_names="${image_warning_names:+$image_warning_names,}$name:${age_days}d"
|
||||
fi
|
||||
fi
|
||||
printf '%d\t%s\t%s\t%s\n' "$age_days" "$name" "$image_tag" "$note" >> "$image_file"
|
||||
@@ -637,6 +707,7 @@ collect_image_freshness() {
|
||||
|
||||
set_summary "image_warnings" "$image_warnings"
|
||||
set_summary "image_allowed" "$image_allowed"
|
||||
set_summary "image_warning_names" "$image_warning_names"
|
||||
|
||||
if [ ! -s "$image_file" ]; then
|
||||
append "- Keine Image-Daten verfuegbar."
|
||||
@@ -781,8 +852,16 @@ collect_traefik_5xx() {
|
||||
set_summary "traefik_5xx" "$count"
|
||||
|
||||
if [ "$count" -eq 0 ]; then
|
||||
set_summary "traefik_5xx_top" "none"
|
||||
append "- Keine 5xx-Antworten."
|
||||
else
|
||||
local top_group
|
||||
top_group="$(awk '{ code=$9; service=$12; gsub(/"/, "", service); counts[service " " code]++ } END { for (k in counts) print counts[k], k }' "$file" \
|
||||
| sort -nr \
|
||||
| head -n 1 \
|
||||
| awk '{ print $2 ":" $3 ":" $1 }' \
|
||||
| sed -E 's#[^A-Za-z0-9_.:@/-]+#_#g')"
|
||||
set_summary "traefik_5xx_top" "${top_group:-none}"
|
||||
append "- 5xx-Antworten: $count"
|
||||
append ""
|
||||
append "### Gruppiert nach Service/Code"
|
||||
@@ -1181,10 +1260,20 @@ write_report() {
|
||||
if [ "$REPORT_STATUS" = "OK" ]; then
|
||||
printf 'Im betrachteten Zeitraum zeigt das Homelab eine stabile Betriebslage. Das letzte Borg-Backup ist erfolgreich abgeschlossen, Prometheus meldet keine firing Alerts, keine unhealthy Container, Zertifikate und Storage im erwarteten Bereich.\n\n'
|
||||
elif [ "$REPORT_STATUS" = "WARNUNG" ]; then
|
||||
printf 'Im betrachteten Zeitraum gibt es Punkte, die Aufmerksamkeit verdienen. Der Betrieb ist nicht automatisch als kompromittiert zu bewerten, aber mindestens ein Signal (Backup, Pending Alert, Zertifikat, Storage, Image-Alter, Drift oder Reboot) weicht vom Normalzustand ab.\n\n'
|
||||
printf 'Im betrachteten Zeitraum gibt es Punkte, die Aufmerksamkeit verdienen. Der Betrieb ist nicht automatisch als kompromittiert zu bewerten; die konkreten Ampel-Ausloeser stehen direkt darunter.\n\n'
|
||||
else
|
||||
printf 'Im betrachteten Zeitraum liegt ein kritisches Betriebssignal vor. Der Bericht sollte zeitnah gelesen und die betroffenen Komponenten priorisiert geprueft werden.\n\n'
|
||||
fi
|
||||
printf '### Warum dieser Status?\n\n'
|
||||
if [ "$REPORT_STATUS" = "OK" ]; then
|
||||
printf '%s\n\n' "- Keine Ampel-Ausloeser im Summary-Set."
|
||||
else
|
||||
print_status_reasons
|
||||
printf '\n'
|
||||
fi
|
||||
printf '### Weitere auffaellige Beobachtungen\n\n'
|
||||
print_notable_observations
|
||||
printf '\n'
|
||||
printf '### Management-Bewertung\n\n'
|
||||
printf '%s\n' "- Status: \`$REPORT_STATUS\`"
|
||||
printf '%s\n' "- Borg Backup: \`${borg_status:-unknown}\`"
|
||||
|
||||
Reference in New Issue
Block a user