cd650b19ac
Operational hardening across several services after live incident analysis between 2026-05-18 and 2026-05-20: - Gitea: disable public registration and OpenID signup/signin to stop the external POST / 5xx bursts that triggered availability alerts. New repo-wide policy requires every productive Micha/homelab-infra Komodo stack to ship with an active Gitea->Komodo webhook on the current stack ID (documented in CLAUDE.md, AI_CONTEXT.md, WORKFLOW.md). - posture-check: extract the Disk1 fstype check into its own function so the documented Disk1 NTFS exception no longer raises ntfy warnings, skip POSIX inode checks on NTFS, and dedup ntfy alerts via a fingerprint state file with ALERT_REPEAT_SECONDS (default 24h). Repeat-spam on the same cause now suppressed. - docker-critical-events: parse the event JSON for container name, action, exit code and signal; drop `die exit=0` events (clean stops); ship a structured ntfy message instead of the raw event line. - Borg UI: mount /mnt/user/services into the backup container as /local/services:ro and include homelab-infra, stacks and posture-check in all-important-sources.txt. RESTORE_MATRIX and DISASTER_RECOVERY updated accordingly. - Unraid user scripts: document the new homelab-operations-report-daily cron job and the SMTP password file it expects on the host. - MIGRATION_LOG: capture the four live events from this window - Gitea 5xx burst + signup closure, Komodo webhook reconciliation, posture-check host-version verification, Borg scope extension, and Traefik 5xx alert detuning. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
395 lines
11 KiB
Bash
Executable File
395 lines
11 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Host posture check: runs filesystem, usage and NVMe checks, writes the
# results as JSON to OUTPUT_PATH and sends ntfy alerts for non-ok results.
#
# Environment overrides (defaults are set below):
#   OUTPUT_PATH           Location of the JSON report.
#   NTFY_BASE_URL         Base URL alerts are sent to; the topic is appended.
#   WARNING_TOPIC         ntfy topic used for warning-level alerts.
#   CRITICAL_TOPIC        ntfy topic used for critical-level alerts.
#   SEND_NTFY             "1" enables ntfy delivery.
#   TMP_DIR               Directory holding the per-run scratch results file.
#   ALLOW_DISK1_NTFS      "1" tolerates ntfs3/fuseblk on /mnt/disk1.
#   ALERT_STATE_PATH      File storing the last alert fingerprint and time.
#   ALERT_REPEAT_SECONDS  Minimum age before an unchanged alert is re-sent.

set -euo pipefail

OUTPUT_PATH="${OUTPUT_PATH:-/mnt/user/services/posture-check/last.json}"
NTFY_BASE_URL="${NTFY_BASE_URL:-https://ntfy.kaleschke.info}"
WARNING_TOPIC="${WARNING_TOPIC:-homelab-alerts}"
CRITICAL_TOPIC="${CRITICAL_TOPIC:-homelab-alerts}"
SEND_NTFY="${SEND_NTFY:-1}"
TMP_DIR="${TMP_DIR:-/tmp/kallilab-posture-check}"
ALLOW_DISK1_NTFS="${ALLOW_DISK1_NTFS:-1}"
ALERT_STATE_PATH="${ALERT_STATE_PATH:-/mnt/user/services/posture-check/last-alert.state}"
ALERT_REPEAT_SECONDS="${ALERT_REPEAT_SECONDS:-86400}"

mkdir -p "$TMP_DIR"

# Scratch file collecting one tab-separated "severity name message" record
# per check. mktemp creates it empty under an unpredictable name, so another
# user cannot pre-create or symlink the path inside a world-writable /tmp.
RESULTS_FILE="$(mktemp "$TMP_DIR/results.XXXXXX")"

# Remove the scratch results file on every exit path.
cleanup() {
  rm -f -- "$RESULTS_FILE"
}

trap cleanup EXIT
|
|
|
|
# Escape stdin for use inside a JSON string literal and write it to stdout.
# Handles backslash, double quote, tab and carriage return. Input is
# processed line by line, so callers are expected to pass single-line values.
json_escape() {
  # Backslashes must be doubled first so the escapes added by the later
  # expressions are not escaped a second time.
  sed \
    -e 's/\\/\\\\/g' \
    -e 's/"/\\"/g' \
    -e 's/\t/\\t/g' \
    -e 's/\r/\\r/g'
}
|
|
|
|
# Append one check result to RESULTS_FILE as a tab-separated record.
# Arguments:
#   $1 - severity (the script uses "ok", "warning" and "critical")
#   $2 - check name
#   $3 - human-readable message
# Tabs and newlines inside a field are replaced by spaces so that every
# result stays exactly one three-column line for the awk/read consumers.
add_result() {
  local severity="$1"
  local name="$2"
  local message="$3"
  local field

  for field in severity name message; do
    # Indirect read / printf -v write flattens each field in place.
    local value="${!field}"
    value="${value//$'\t'/ }"
    value="${value//$'\n'/ }"
    printf -v "$field" '%s' "$value"
  done

  printf '%s\t%s\t%s\n' "$severity" "$name" "$message" >> "$RESULTS_FILE"
}
|
|
|
|
# Verify that a command is available.
# Arguments:
#   $1 - command name to look up
# Returns:
#   0 if the command is found; otherwise records a "warning" result named
#   command_<name> and returns 1.
need_cmd() {
  if command -v "$1" >/dev/null 2>&1; then
    return 0
  fi

  add_result "warning" "command_$1" "Command missing: $1"
  return 1
}
|
|
|
|
# Compare the filesystem type of a mount point against an expected type.
# Arguments:
#   $1 - mount path
#   $2 - expected filesystem type as reported by findmnt
#   $3 - severity to record when the mount is missing or the type differs
#   $4 - check name used in the result record
# Records exactly one result; a missing findmnt is always a "warning".
check_fstype() {
  local path="$1"
  local expected="$2"
  local severity="$3"
  local name="$4"
  local actual

  if ! command -v findmnt >/dev/null 2>&1; then
    add_result "warning" "$name" "Cannot check $path filesystem because findmnt is missing"
    return
  fi

  # A failing findmnt is reported as "mount not found".
  if ! actual="$(findmnt -no FSTYPE "$path" 2>/dev/null)"; then
    add_result "$severity" "$name" "Mount not found: $path"
    return
  fi

  if [ "$actual" != "$expected" ]; then
    add_result "$severity" "$name" "$path filesystem is $actual, expected $expected"
    return
  fi

  add_result "ok" "$name" "$path filesystem is $actual"
}
|
|
|
|
# Check the filesystem type of /mnt/disk1, honouring ALLOW_DISK1_NTFS.
# With ALLOW_DISK1_NTFS=1, ntfs3/fuseblk is "ok" and anything else is a
# "warning". Otherwise xfs is "ok" and anything else is "critical".
# A missing findmnt or a missing mount is recorded as a "warning".
check_disk1_fstype() {
  local actual

  if ! command -v findmnt >/dev/null 2>&1; then
    add_result "warning" "disk1_fstype" "Cannot check /mnt/disk1 filesystem because findmnt is missing"
    return
  fi

  if ! actual="$(findmnt -no FSTYPE "/mnt/disk1" 2>/dev/null)"; then
    add_result "warning" "disk1_fstype" "Mount not found: /mnt/disk1"
    return
  fi

  if [ "$ALLOW_DISK1_NTFS" = "1" ]; then
    case "$actual" in
      ntfs3 | fuseblk)
        add_result "ok" "disk1_fstype" "/mnt/disk1 filesystem is $actual; temporarily allowed until Disk1 phase 2 migration"
        ;;
      *)
        add_result "warning" "disk1_fstype" "/mnt/disk1 filesystem is $actual, expected ntfs3/fuseblk during temporary Disk1 migration exception"
        ;;
    esac
    return
  fi

  case "$actual" in
    xfs)
      add_result "ok" "disk1_fstype" "/mnt/disk1 filesystem is $actual"
      ;;
    *)
      add_result "critical" "disk1_fstype" "/mnt/disk1 filesystem is $actual, expected xfs"
      ;;
  esac
}
|
|
|
|
# Record "critical" if any mount at or below the core mount points is
# ntfs3 or fuseblk. The core mount points are /mnt/cache and /mnt/disk1;
# with ALLOW_DISK1_NTFS=1 only /mnt/cache is inspected.
# A missing or failing findmnt is recorded as a "warning".
check_no_ntfs_on_core_mounts() {
  local hits
  local pattern="^/mnt/(cache|disk1)(/|$)"

  if ! command -v findmnt >/dev/null 2>&1; then
    add_result "warning" "no_ntfs_core_mounts" "Cannot check NTFS mounts because findmnt is missing"
    return
  fi

  if [ "$ALLOW_DISK1_NTFS" = "1" ]; then
    pattern="^/mnt/cache(/|$)"
  fi

  # Guard the pipeline: with `set -e -o pipefail` active, an unguarded
  # assignment would abort the script when findmnt fails, before any
  # report is written.
  if ! hits="$(
    findmnt -rn -o TARGET,FSTYPE 2>/dev/null \
      | awk -v pattern="$pattern" '$1 ~ pattern && ($2 == "ntfs3" || $2 == "fuseblk") { print $1 ":" $2 }' \
      | paste -sd ',' -
  )"; then
    add_result "warning" "no_ntfs_core_mounts" "Cannot list mounts via findmnt"
    return
  fi

  if [ -n "$hits" ]; then
    add_result "critical" "no_ntfs_core_mounts" "NTFS-like filesystem on core mount: $hits"
  elif [ "$ALLOW_DISK1_NTFS" = "1" ]; then
    add_result "ok" "no_ntfs_core_mounts" "No NTFS on /mnt/cache; /mnt/disk1 NTFS is temporarily allowed until Disk1 phase 2 migration"
  else
    add_result "ok" "no_ntfs_core_mounts" "No ntfs3/fuseblk mounts below /mnt/cache or /mnt/disk1"
  fi
}
|
|
|
|
# Record "critical" when the appdata directory on disk1 contains any entry.
# Arguments:
#   $1 - optional directory to inspect (default: /mnt/disk1/appdata)
# A missing directory and an empty directory are both recorded as "ok".
check_mover_drift() {
  local path="${1:-/mnt/disk1/appdata}"
  local first_entry

  if [ ! -d "$path" ]; then
    add_result "ok" "mover_drift_appdata" "$path does not exist"
    return
  fi

  # -quit stops at the first entry, so large trees are not walked fully.
  first_entry="$(find "$path" -mindepth 1 -print -quit)" || true

  if [ -n "$first_entry" ]; then
    add_result "critical" "mover_drift_appdata" "$path contains entries; appdata should stay cache-only"
  else
    add_result "ok" "mover_drift_appdata" "$path is empty"
  fi
}
|
|
|
|
# Record inode usage for a path, based on the fifth column of `df -Pi`.
# Arguments:
#   $1 - path to inspect
#   $2 - threshold in percent; usage at or above it is a "warning"
#   $3 - check name used in the result record
# A value that cannot be read or is not a plain integer is a "warning".
check_inode_usage() {
  local path="$1"
  local max_percent="$2"
  local name="$3"
  local use_percent

  if ! use_percent="$(df -Pi "$path" 2>/dev/null | awk 'NR==2 { gsub("%", "", $5); print $5 }')"; then
    add_result "warning" "$name" "Cannot read inode usage for $path"
    return
  fi

  # Reject empty or non-integer values before the numeric comparison.
  if ! [[ "$use_percent" =~ ^[0-9]+$ ]]; then
    add_result "warning" "$name" "$path inode usage unavailable (${use_percent:-unknown})"
    return
  fi

  if [ "$use_percent" -ge "$max_percent" ]; then
    add_result "warning" "$name" "$path inode usage ${use_percent}% >= ${max_percent}%"
    return
  fi

  add_result "ok" "$name" "$path inode usage ${use_percent}%"
}
|
|
|
|
# Inode usage check for /mnt/disk1 with an 80% threshold.
# With ALLOW_DISK1_NTFS=1 the check is skipped and recorded as "ok".
# The decision is based on the flag alone, not on the mounted filesystem.
check_disk1_inode_usage() {
  if [ "$ALLOW_DISK1_NTFS" != "1" ]; then
    check_inode_usage "/mnt/disk1" 80 "disk1_inode_usage"
    return
  fi

  add_result "ok" "disk1_inode_usage" "/mnt/disk1 inode usage skipped; NTFS transition filesystem does not expose POSIX inode usage"
}
|
|
|
|
# Record the fill level of the filesystem holding a path, based on the
# fifth column of `df -P`.
# Arguments:
#   $1 - path to inspect
#   $2 - threshold in percent; usage at or above it is reported
#   $3 - check name used in the result record
#   $4 - severity to record when the threshold is reached
# A value that cannot be read or is not a plain integer is a "warning".
check_filesystem_usage() {
  local path="$1"
  local max_percent="$2"
  local name="$3"
  local severity="$4"
  local use_percent

  if ! use_percent="$(df -P "$path" 2>/dev/null | awk 'NR==2 { gsub("%", "", $5); print $5 }')"; then
    add_result "warning" "$name" "Cannot read filesystem usage for $path"
    return
  fi

  # Without this guard an empty or "-" value makes the numeric test fail
  # with an error and fall through to the threshold-exceeded branch.
  if ! [[ "$use_percent" =~ ^[0-9]+$ ]]; then
    add_result "warning" "$name" "$path usage unavailable (${use_percent:-unknown})"
    return
  fi

  if [ "$use_percent" -lt "$max_percent" ]; then
    add_result "ok" "$name" "$path usage ${use_percent}%"
  else
    add_result "$severity" "$name" "$path usage ${use_percent}% >= ${max_percent}%"
  fi
}
|
|
|
|
# Inspect `nvme smart-log` for the device in NVME_DEVICE
# (default /dev/nvme0n1) and record three results:
#   nvme_critical_warning - "critical" unless the value is zero
#   nvme_percentage_used  - "critical" when missing or >= 80
#   nvme_media_errors     - "warning" when non-zero
# If the nvme command is missing, need_cmd records the warning and nothing
# else is checked. An unreadable smart-log is recorded as "critical".
check_nvme_smart() {
  local device="${NVME_DEVICE:-/dev/nvme0n1}"
  local smart
  local warning
  local percentage_used
  local media_errors

  if ! need_cmd nvme; then
    return
  fi

  if ! smart="$(nvme smart-log "$device" 2>/dev/null)"; then
    add_result "critical" "nvme_smart" "Cannot read nvme smart-log for $device"
    return
  fi

  # Each field is taken from the first matching "key : value" line.
  warning="$(printf '%s\n' "$smart" | awk -F: '/critical_warning/ { gsub(/[[:space:]]/, "", $2); print $2; exit }')"
  percentage_used="$(printf '%s\n' "$smart" | awk -F: '/percentage_used/ { gsub(/[^0-9]/, "", $2); print $2; exit }')"
  media_errors="$(printf '%s\n' "$smart" | awk -F: '/media_errors/ { gsub(/[^0-9]/, "", $2); print $2; exit }')"

  # Accept every all-zero spelling (0, 0x0, 0x00, ...) as "no warning";
  # a missing field is treated as zero.
  if [[ "${warning:-0}" =~ ^(0[xX])?0+$ ]]; then
    add_result "ok" "nvme_critical_warning" "$device critical_warning ${warning:-0}"
  else
    add_result "critical" "nvme_critical_warning" "$device critical_warning ${warning}"
  fi

  if [ -n "${percentage_used:-}" ] && [ "$percentage_used" -lt 80 ]; then
    add_result "ok" "nvme_percentage_used" "$device percentage_used ${percentage_used}%"
  else
    add_result "critical" "nvme_percentage_used" "$device percentage_used ${percentage_used:-unknown}, expected <80"
  fi

  if [ "${media_errors:-0}" = "0" ]; then
    add_result "ok" "nvme_media_errors" "$device media_errors 0"
  else
    add_result "warning" "nvme_media_errors" "$device media_errors ${media_errors}"
  fi
}
|
|
|
|
# Send one notification to ntfy. Best effort: always returns 0.
# Arguments:
#   $1 - severity, used in the message title
#   $2 - ntfy topic, appended to NTFY_BASE_URL
#   $3 - message body
# Does nothing unless SEND_NTFY is "1" and curl is available.
# NTFY_TIMEOUT_SECONDS (default 15) bounds the whole request so a stalled
# endpoint cannot hang the run.
send_ntfy() {
  local severity="$1"
  local topic="$2"
  local body="$3"

  if [ "$SEND_NTFY" != "1" ]; then
    return
  fi

  if ! command -v curl >/dev/null 2>&1; then
    return 0
  fi

  # Delivery failures are deliberately ignored; curl -S still prints the
  # error to stderr.
  printf '%s\n' "$body" | curl -fsS \
    --max-time "${NTFY_TIMEOUT_SECONDS:-15}" \
    -H "Title: KalliLab posture-check $severity" \
    -H "Priority: high" \
    --data-binary @- \
    "$NTFY_BASE_URL/$topic" >/dev/null || true
}
|
|
|
|
# Print a fingerprint of all non-ok results in RESULTS_FILE as
# "<checksum>:<byte count>", derived from cksum over the
# "severity|name|message" lines. Results with severity "ok" are ignored.
alert_fingerprint() {
  awk -F '\t' '$1 != "ok" { printf "%s|%s|%s\n", $1, $2, $3 }' "$RESULTS_FILE" \
    | cksum \
    | awk '{ print $1 ":" $2 }'
}
|
|
|
|
# Print the non-ok results in RESULTS_FILE as "severity:name" pairs joined
# by "; ". Prints nothing when every result is "ok".
alert_summary() {
  awk -F '\t' '$1 != "ok" { printf "%s:%s; ", $1, $2 }' "$RESULTS_FILE" \
    | sed 's/; $//'
}
|
|
|
|
# Decide whether an alert with the given fingerprint should be sent now.
# Arguments:
#   $1 - fingerprint of the current non-ok results
# Returns:
#   0 (send) when there is no stored state, the fingerprint changed, the
#   stored timestamp is invalid, or at least ALERT_REPEAT_SECONDS have
#   passed since the last send; 1 (suppress) otherwise.
# Side effect: resets a non-numeric ALERT_REPEAT_SECONDS to 86400.
should_send_alert() {
  local fingerprint="$1"
  local now
  local last_fingerprint=""
  local last_sent="0"

  now="$(date +%s)"

  if ! [[ "$ALERT_REPEAT_SECONDS" =~ ^[0-9]+$ ]]; then
    ALERT_REPEAT_SECONDS=86400
  fi

  if [ -f "$ALERT_STATE_PATH" ]; then
    # State file format: "<fingerprint>\t<epoch seconds>".
    IFS=$'\t' read -r last_fingerprint last_sent < "$ALERT_STATE_PATH" || true
  fi

  if [ "$fingerprint" != "$last_fingerprint" ]; then
    return 0
  fi

  if ! [[ "$last_sent" =~ ^[0-9]+$ ]]; then
    return 0
  fi

  # 10# forces base 10 so digit strings with leading zeros are not parsed
  # as octal by shell arithmetic.
  if ((now - 10#$last_sent >= 10#$ALERT_REPEAT_SECONDS)); then
    return 0
  fi

  return 1
}
|
|
|
|
# Persist the fingerprint of the alert just sent, with the current time.
# Arguments:
#   $1 - alert fingerprint
# Writes "<fingerprint>\t<epoch seconds>" to ALERT_STATE_PATH, creating the
# parent directory if needed. The data goes to a .tmp sibling first and is
# then renamed into place.
remember_alert() {
  local fingerprint="$1"
  local now
  local staging="$ALERT_STATE_PATH.tmp"

  now="$(date +%s)"

  mkdir -p "$(dirname "$ALERT_STATE_PATH")"
  printf '%s\t%s\n' "$fingerprint" "$now" > "$staging"
  mv "$staging" "$ALERT_STATE_PATH"
}
|
|
|
|
# Remove the stored alert state and any leftover .tmp staging file.
# Succeeds when neither file exists.
clear_alert_state() {
  rm -f "$ALERT_STATE_PATH" "$ALERT_STATE_PATH.tmp"
}
|
|
|
|
# Send an alert unless the same set of non-ok results was already reported
# within the repeat window.
# Arguments:
#   $1 - severity, passed through to send_ntfy
#   $2 - ntfy topic
#   $3 - message body; " Checks: <summary>" is appended when a summary exists
# NOTE(review): the fingerprint is remembered after send_ntfy returns, and
# send_ntfy ignores delivery failures, so an undelivered alert is also
# suppressed until the repeat window passes - confirm this is intended.
send_alert_once() {
  local severity="$1"
  local topic="$2"
  local body="$3"
  local fingerprint
  local summary

  fingerprint="$(alert_fingerprint)"
  summary="$(alert_summary)"

  if [ -n "$summary" ]; then
    body="$body Checks: $summary"
  fi

  if ! should_send_alert "$fingerprint"; then
    return 0
  fi

  send_ntfy "$severity" "$topic" "$body"
  remember_alert "$fingerprint"
}
|
|
|
|
# Write all collected results as a JSON report to OUTPUT_PATH, print the
# report to stdout and trigger alerting according to the overall status.
# Overall status is "critical" if any result is critical, else "warning"
# if any result is a warning, else "ok".
# Returns:
#   2 for critical, 1 for warning (after calling send_alert_once);
#   for ok, clears the stored alert state and returns that call's status.
write_json() {
  local timestamp
  local critical_count
  local warning_count
  local status
  local first=1
  # Loop variables of the read below; declared local so they do not leak
  # into the caller's scope.
  local severity name message

  timestamp="$(date -Iseconds)"
  critical_count="$(awk -F '\t' '$1 == "critical" { count++ } END { print count + 0 }' "$RESULTS_FILE")"
  warning_count="$(awk -F '\t' '$1 == "warning" { count++ } END { print count + 0 }' "$RESULTS_FILE")"

  if [ "$critical_count" -gt 0 ]; then
    status="critical"
  elif [ "$warning_count" -gt 0 ]; then
    status="warning"
  else
    status="ok"
  fi

  mkdir -p "$(dirname "$OUTPUT_PATH")"

  # Build the report in a .tmp sibling and rename it into place afterwards.
  {
    printf '{\n'
    printf ' "timestamp": "%s",\n' "$(printf '%s' "$timestamp" | json_escape)"
    printf ' "status": "%s",\n' "$status"
    printf ' "critical_count": %s,\n' "$critical_count"
    printf ' "warning_count": %s,\n' "$warning_count"
    printf ' "checks": [\n'
    while IFS=$'\t' read -r severity name message; do
      # Emit the separator before every element except the first.
      if [ "$first" -eq 0 ]; then
        printf ',\n'
      fi
      first=0
      printf ' {"severity":"%s","name":"%s","message":"%s"}' \
        "$(printf '%s' "$severity" | json_escape)" \
        "$(printf '%s' "$name" | json_escape)" \
        "$(printf '%s' "$message" | json_escape)"
    done < "$RESULTS_FILE"
    printf '\n ]\n'
    printf '}\n'
  } > "$OUTPUT_PATH.tmp"
  mv "$OUTPUT_PATH.tmp" "$OUTPUT_PATH"

  cat "$OUTPUT_PATH"

  if [ "$status" = "critical" ]; then
    send_alert_once "critical" "$CRITICAL_TOPIC" "Posture-check critical: $critical_count critical, $warning_count warning. See $OUTPUT_PATH"
    return 2
  fi

  if [ "$status" = "warning" ]; then
    send_alert_once "warning" "$WARNING_TOPIC" "Posture-check warning: $warning_count warning. See $OUTPUT_PATH"
    return 1
  fi

  clear_alert_state
}
|
|
|
|
# Run every posture check in order, then write the report.
# The return status is that of write_json.
main() {
  # Loop variable; local so it does not leak into global scope.
  local share

  # Missing tools are recorded as warnings by need_cmd; the run continues.
  need_cmd findmnt || true
  need_cmd df || true
  need_cmd awk || true

  check_fstype "/mnt/cache" "xfs" "critical" "cache_fstype"
  check_disk1_fstype
  check_no_ntfs_on_core_mounts
  check_mover_drift
  check_inode_usage "/mnt/cache" 80 "cache_inode_usage"
  check_disk1_inode_usage
  check_filesystem_usage "/mnt/cache" 70 "cache_fill_level" "warning"

  for share in appdata system domains; do
    if [ -e "/mnt/user/$share" ]; then
      check_filesystem_usage "/mnt/user/$share" 70 "share_${share}_fill_level" "warning"
    else
      add_result "warning" "share_${share}_fill_level" "/mnt/user/$share missing"
    fi
  done

  check_nvme_smart
  write_json
}
|
|
|
|
# Entry point. main ends with write_json, which returns 2 for a critical and
# 1 for a warning overall status, so that value becomes the exit status.
main "$@"
|