Files
homelab-infra/services/posture-check/posture-check.sh
T
Micha cd650b19ac Close Gitea signup, dedup posture-check alerts, extend Borg scope
Operational hardening across several services after live incident
analysis between 2026-05-18 and 2026-05-20:

- Gitea: disable public registration and OpenID signup/signin to
  stop the external POST / 5xx bursts that triggered availability
  alerts. New repo-wide policy requires every productive
  Micha/homelab-infra Komodo stack to ship with an active
  Gitea->Komodo webhook on the current stack ID (documented in
  CLAUDE.md, AI_CONTEXT.md, WORKFLOW.md).
- posture-check: extract the Disk1 fstype check into its own
  function so the documented Disk1 NTFS exception no longer raises
  ntfy warnings, skip POSIX inode checks on NTFS, and dedup ntfy
  alerts via a fingerprint state file with ALERT_REPEAT_SECONDS
  (default 24h). Repeat-spam on the same cause is now suppressed.
- docker-critical-events: parse the event JSON for container name,
  action, exit code and signal; drop `die exit=0` events (clean
  stops); ship a structured ntfy message instead of the raw event
  line.
- Borg UI: mount /mnt/user/services into the backup container as
  /local/services:ro and include homelab-infra, stacks and
  posture-check in all-important-sources.txt. RESTORE_MATRIX and
  DISASTER_RECOVERY updated accordingly.
- Unraid user scripts: document the new
  homelab-operations-report-daily cron job and the SMTP password
  file it expects on the host.
- MIGRATION_LOG: capture the five live events from this window -
  Gitea 5xx burst + signup closure, Komodo webhook reconciliation,
  posture-check host-version verification, Borg scope extension,
  and Traefik 5xx alert detuning.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-23 11:05:35 +02:00

395 lines
11 KiB
Bash
Executable File

#!/usr/bin/env bash
set -euo pipefail

# Runtime configuration. Every value below can be overridden from the
# environment; the right-hand side is the default.

# Where write_json stores the JSON report.
OUTPUT_PATH="${OUTPUT_PATH:-/mnt/user/services/posture-check/last.json}"
# ntfy endpoint and topics used by send_ntfy / write_json.
NTFY_BASE_URL="${NTFY_BASE_URL:-https://ntfy.kaleschke.info}"
WARNING_TOPIC="${WARNING_TOPIC:-homelab-alerts}"
CRITICAL_TOPIC="${CRITICAL_TOPIC:-homelab-alerts}"
# "1" enables ntfy delivery; any other value makes send_ntfy a no-op.
SEND_NTFY="${SEND_NTFY:-1}"
# Directory that holds the per-run results scratch file.
TMP_DIR="${TMP_DIR:-/tmp/kallilab-posture-check}"
# "1" accepts ntfs3/fuseblk on /mnt/disk1 (see check_disk1_fstype).
ALLOW_DISK1_NTFS="${ALLOW_DISK1_NTFS:-1}"
# Alert de-duplication state file and the minimum number of seconds between
# repeats of an identical alert (see should_send_alert).
ALERT_STATE_PATH="${ALERT_STATE_PATH:-/mnt/user/services/posture-check/last-alert.state}"
ALERT_REPEAT_SECONDS="${ALERT_REPEAT_SECONDS:-86400}"

mkdir -p "$TMP_DIR"

# One "severity<TAB>name<TAB>message" line per check is appended to this file.
# mktemp creates it empty under an unpredictable name, so a pre-planted file or
# symlink named after the PID in the shared temp directory cannot be
# truncated or followed.
RESULTS_FILE="$(mktemp "$TMP_DIR/results.XXXXXX")"
# Remove the per-run results scratch file.
cleanup() {
  rm -f -- "$RESULTS_FILE"
}

# Run the cleanup on every exit path of the script.
trap 'cleanup' EXIT
# Escape stdin so it can be placed between double quotes in a JSON document.
#
# Reads:   text on stdin
# Writes:  the escaped text on stdout
#
# Escapes backslash, double quote, tab, carriage return, backspace, form feed
# and embedded newlines. JSON forbids raw control characters inside strings,
# so leaving any of these unescaped would make the report unparsable.
json_escape() {
  # The first five expressions slurp the whole input into the pattern space so
  # that embedded newlines can be rewritten too. Control characters are passed
  # to sed as literal bytes via $'...' instead of relying on sed understanding
  # "\t"/"\r" inside a regex. Backslashes are doubled first so the backslashes
  # introduced by the later substitutions are not escaped again.
  sed \
    -e ':a' -e '$!{' -e 'N' -e 'ba' -e '}' \
    -e 's/\\/\\\\/g' \
    -e 's/"/\\"/g' \
    -e $'s/\t/\\\\t/g' \
    -e $'s/\r/\\\\r/g' \
    -e $'s/\b/\\\\b/g' \
    -e $'s/\f/\\\\f/g' \
    -e 's/\n/\\n/g'
}
# Append one check outcome to the results file.
#
# Arguments: $1 - severity, $2 - check name, $3 - message
# Globals:   RESULTS_FILE (appended to)
# The record is a single line with the three fields separated by tabs.
add_result() {
  local level="$1" check="$2" detail="$3"

  printf '%s\t%s\t%s\n' "$level" "$check" "$detail" >>"$RESULTS_FILE"
}
# Verify that a command is available.
#
# Arguments: $1 - command name
# Returns:   0 when the command is found; otherwise records a "warning"
#            result named command_<name> and returns 1.
need_cmd() {
  local tool="$1"

  command -v "$tool" >/dev/null 2>&1 && return 0

  add_result "warning" "command_$tool" "Command missing: $tool"
  return 1
}
# Compare the filesystem type of a mount point with an expected type.
#
# Arguments: $1 - mount path
#            $2 - expected filesystem type
#            $3 - severity recorded when the mount is missing or the type differs
#            $4 - check name used for the recorded result
# A missing findmnt binary is always recorded as a "warning".
check_fstype() {
  local mount_path="$1" wanted="$2" level="$3" check="$4"
  local found

  command -v findmnt >/dev/null 2>&1 || {
    add_result "warning" "$check" "Cannot check $mount_path filesystem because findmnt is missing"
    return
  }

  found="$(findmnt -no FSTYPE "$mount_path" 2>/dev/null)" || {
    add_result "$level" "$check" "Mount not found: $mount_path"
    return
  }

  if [ "$found" != "$wanted" ]; then
    add_result "$level" "$check" "$mount_path filesystem is $found, expected $wanted"
    return
  fi
  add_result "ok" "$check" "$mount_path filesystem is $found"
}
# Check the filesystem type of /mnt/disk1, honouring the ALLOW_DISK1_NTFS switch.
#
# Globals: ALLOW_DISK1_NTFS (read)
# With ALLOW_DISK1_NTFS=1, ntfs3/fuseblk is "ok" and anything else a "warning".
# Otherwise xfs is "ok" and anything else is "critical".
# A missing findmnt or a missing mount is recorded as a "warning".
check_disk1_fstype() {
  local fs

  command -v findmnt >/dev/null 2>&1 || {
    add_result "warning" "disk1_fstype" "Cannot check /mnt/disk1 filesystem because findmnt is missing"
    return
  }

  fs="$(findmnt -no FSTYPE "/mnt/disk1" 2>/dev/null)" || {
    add_result "warning" "disk1_fstype" "Mount not found: /mnt/disk1"
    return
  }

  if [ "$ALLOW_DISK1_NTFS" != "1" ]; then
    # Strict mode: only xfs is acceptable.
    case "$fs" in
      xfs)
        add_result "ok" "disk1_fstype" "/mnt/disk1 filesystem is $fs"
        ;;
      *)
        add_result "critical" "disk1_fstype" "/mnt/disk1 filesystem is $fs, expected xfs"
        ;;
    esac
    return
  fi

  # Exception mode: the NTFS driver types are the expected state.
  case "$fs" in
    ntfs3 | fuseblk)
      add_result "ok" "disk1_fstype" "/mnt/disk1 filesystem is $fs; temporarily allowed until Disk1 phase 2 migration"
      ;;
    *)
      add_result "warning" "disk1_fstype" "/mnt/disk1 filesystem is $fs, expected ntfs3/fuseblk during temporary Disk1 migration exception"
      ;;
  esac
}
# Record a "critical" result when an ntfs3/fuseblk filesystem is mounted at or
# below a core mount point.
#
# Globals: ALLOW_DISK1_NTFS (read) - when "1" only /mnt/cache is inspected,
#          otherwise both /mnt/cache and /mnt/disk1.
check_no_ntfs_on_core_mounts() {
  local mount_re offenders

  if ! command -v findmnt >/dev/null 2>&1; then
    add_result "warning" "no_ntfs_core_mounts" "Cannot check NTFS mounts because findmnt is missing"
    return
  fi

  if [ "$ALLOW_DISK1_NTFS" = "1" ]; then
    mount_re="^/mnt/cache(/|$)"
  else
    mount_re="^/mnt/(cache|disk1)(/|$)"
  fi

  # Collect matching mounts as a comma-separated "target:fstype" list.
  offenders="$(
    findmnt -rn -o TARGET,FSTYPE 2>/dev/null \
      | awk -v re="$mount_re" '($2 == "ntfs3" || $2 == "fuseblk") && $1 ~ re { print $1 ":" $2 }' \
      | paste -sd ',' -
  )"

  if [ -n "$offenders" ]; then
    add_result "critical" "no_ntfs_core_mounts" "NTFS-like filesystem on core mount: $offenders"
    return
  fi

  if [ "$ALLOW_DISK1_NTFS" = "1" ]; then
    add_result "ok" "no_ntfs_core_mounts" "No NTFS on /mnt/cache; /mnt/disk1 NTFS is temporarily allowed until Disk1 phase 2 migration"
    return
  fi
  add_result "ok" "no_ntfs_core_mounts" "No ntfs3/fuseblk mounts below /mnt/cache or /mnt/disk1"
}
# Record a "critical" result when /mnt/disk1/appdata contains any entry.
# A missing directory and an empty directory are both recorded as "ok".
check_mover_drift() {
  local appdata_dir="/mnt/disk1/appdata"

  [ -d "$appdata_dir" ] || {
    add_result "ok" "mover_drift_appdata" "$appdata_dir does not exist"
    return
  }

  # -quit stops find after the first entry; grep only tests for any output.
  if find "$appdata_dir" -mindepth 1 -print -quit | grep -q .; then
    add_result "critical" "mover_drift_appdata" "$appdata_dir contains entries; appdata should stay cache-only"
    return
  fi
  add_result "ok" "mover_drift_appdata" "$appdata_dir is empty"
}
# Record whether inode usage of a path stays below a threshold.
#
# Arguments: $1 - path handed to df
#            $2 - threshold in percent; usage at or above it is a "warning"
#            $3 - check name used for the recorded result
# A df failure or a non-numeric percentage is recorded as a "warning".
check_inode_usage() {
  local target="$1" limit="$2" check="$3"
  local pct

  # Second line of `df -Pi` is the data row; column 5 is the usage percentage.
  pct="$(df -Pi "$target" 2>/dev/null | awk 'NR==2 { gsub("%", "", $5); print $5 }')" || {
    add_result "warning" "$check" "Cannot read inode usage for $target"
    return
  }

  # Anything other than a plain run of digits cannot be compared numerically.
  case "$pct" in
    '' | *[!0-9]*)
      add_result "warning" "$check" "$target inode usage unavailable (${pct:-unknown})"
      return
      ;;
  esac

  if [ "$pct" -ge "$limit" ]; then
    add_result "warning" "$check" "$target inode usage ${pct}% >= ${limit}%"
    return
  fi
  add_result "ok" "$check" "$target inode usage ${pct}%"
}
# Inode check for /mnt/disk1.
#
# Globals: ALLOW_DISK1_NTFS (read) - when "1" the check is skipped and an "ok"
#          result explaining the skip is recorded; otherwise the regular inode
#          check runs with an 80% threshold.
check_disk1_inode_usage() {
  if [ "$ALLOW_DISK1_NTFS" != "1" ]; then
    check_inode_usage "/mnt/disk1" 80 "disk1_inode_usage"
    return
  fi

  add_result "ok" "disk1_inode_usage" "/mnt/disk1 inode usage skipped; NTFS transition filesystem does not expose POSIX inode usage"
}
# Record whether the fill level of a filesystem stays below a threshold.
#
# Arguments: $1 - path handed to df
#            $2 - threshold in percent; usage at or above it is reported
#            $3 - check name used for the recorded result
#            $4 - severity recorded when the threshold is reached
# A df failure or a non-numeric percentage is recorded as a "warning".
check_filesystem_usage() {
  local path="$1"
  local max_percent="$2"
  local name="$3"
  local severity="$4"
  local use_percent

  # Second line of `df -P` is the data row; column 5 is the usage percentage.
  if ! use_percent="$(df -P "$path" 2>/dev/null | awk 'NR==2 { gsub("%", "", $5); print $5 }')"; then
    add_result "warning" "$name" "Cannot read filesystem usage for $path"
    return
  fi

  # df can succeed without yielding a number (empty output, "-" placeholder).
  # Without this guard the numeric test below errors out and the value is
  # reported as if it had exceeded the threshold.
  if ! printf '%s' "$use_percent" | grep -Eq '^[0-9]+$'; then
    add_result "warning" "$name" "$path usage unavailable (${use_percent:-unknown})"
    return
  fi

  if [ "$use_percent" -lt "$max_percent" ]; then
    add_result "ok" "$name" "$path usage ${use_percent}%"
  else
    add_result "$severity" "$name" "$path usage ${use_percent}% >= ${max_percent}%"
  fi
}
# Evaluate three fields of `nvme smart-log` for one device.
#
# Globals: NVME_DEVICE (read, optional) - device path, default /dev/nvme0n1
# Records one result each for critical_warning, percentage_used and
# media_errors, or a single "critical" result when the log cannot be read.
# Nothing further is recorded here when the nvme command is unavailable.
check_nvme_smart() {
  local device="${NVME_DEVICE:-/dev/nvme0n1}"
  local log_text crit_flag wear_pct media_err

  need_cmd nvme || return 0

  log_text="$(nvme smart-log "$device" 2>/dev/null)" || {
    add_result "critical" "nvme_smart" "Cannot read nvme smart-log for $device"
    return
  }

  # Each field is the text after the first ':' on its line; the numeric
  # fields are stripped down to their digits.
  crit_flag="$(awk -F: '/critical_warning/ { gsub(/[[:space:]]/, "", $2); print $2; exit }' <<<"$log_text")"
  wear_pct="$(awk -F: '/percentage_used/ { gsub(/[^0-9]/, "", $2); print $2; exit }' <<<"$log_text")"
  media_err="$(awk -F: '/media_errors/ { gsub(/[^0-9]/, "", $2); print $2; exit }' <<<"$log_text")"

  # An absent critical_warning field is treated like 0.
  case "${crit_flag:-0}" in
    0 | 0x00)
      add_result "ok" "nvme_critical_warning" "$device critical_warning ${crit_flag:-0}"
      ;;
    *)
      add_result "critical" "nvme_critical_warning" "$device critical_warning ${crit_flag}"
      ;;
  esac

  # An absent percentage_used field counts as a failure.
  if [ -z "${wear_pct:-}" ] || [ "$wear_pct" -ge 80 ]; then
    add_result "critical" "nvme_percentage_used" "$device percentage_used ${wear_pct:-unknown}, expected <80"
  else
    add_result "ok" "nvme_percentage_used" "$device percentage_used ${wear_pct}%"
  fi

  # An absent media_errors field is treated like 0.
  case "${media_err:-0}" in
    0)
      add_result "ok" "nvme_media_errors" "$device media_errors 0"
      ;;
    *)
      add_result "warning" "nvme_media_errors" "$device media_errors ${media_err}"
      ;;
  esac
}
# Post a notification to an ntfy topic (best effort).
#
# Arguments: $1 - severity, used in the message title
#            $2 - ntfy topic
#            $3 - message body
# Globals:   SEND_NTFY (read) - delivery only happens when it is "1"
#            NTFY_BASE_URL (read)
#            NTFY_CONNECT_TIMEOUT, NTFY_MAX_TIME (read, optional) - curl
#            timeouts in seconds, defaults 10 and 30
# Returns:   always 0; a delivery problem is reported on stderr only, so a
#            notification failure never aborts the calling run.
send_ntfy() {
  local severity="$1"
  local topic="$2"
  local body="$3"

  if [ "$SEND_NTFY" != "1" ]; then
    return 0
  fi

  if ! command -v curl >/dev/null 2>&1; then
    printf 'posture-check: curl is missing; cannot send %s notification\n' "$severity" >&2
    return 0
  fi

  # Bound both connection setup and the whole transfer so an unresponsive
  # ntfy host cannot hang the run indefinitely.
  if ! printf '%s\n' "$body" | curl -fsS \
    --connect-timeout "${NTFY_CONNECT_TIMEOUT:-10}" \
    --max-time "${NTFY_MAX_TIME:-30}" \
    -H "Title: KalliLab posture-check $severity" \
    -H "Priority: high" \
    --data-binary @- \
    "$NTFY_BASE_URL/$topic" >/dev/null; then
    printf 'posture-check: failed to deliver %s notification to %s\n' "$severity" "$NTFY_BASE_URL/$topic" >&2
  fi
  return 0
}
# Print a fingerprint of all non-"ok" results as "<crc>:<byte count>".
#
# Globals: RESULTS_FILE (read)
# The fingerprint covers severity, name and message of every non-ok record,
# in file order, as checksummed by cksum.
alert_fingerprint() {
  awk 'BEGIN { FS = "\t"; OFS = "|" } $1 != "ok" { print $1, $2, $3 }' "$RESULTS_FILE" \
    | cksum \
    | awk '{ printf "%s:%s\n", $1, $2 }'
}
# Print the non-"ok" results as "severity:name" pairs joined by "; ".
#
# Globals: RESULTS_FILE (read)
# Prints nothing when every result is "ok".
alert_summary() {
  # The separator is emitted before every pair except the first, so no
  # trailing "; " has to be trimmed afterwards.
  awk -F '\t' '
    $1 != "ok" {
      printf "%s%s:%s", sep, $1, $2
      sep = "; "
    }
  ' "$RESULTS_FILE"
}
# Decide whether an alert with the given fingerprint should be sent now.
#
# Arguments: $1 - fingerprint of the current alert
# Globals:   ALERT_STATE_PATH (read) - "<fingerprint><TAB><epoch seconds>"
#            ALERT_REPEAT_SECONDS (read; reset to 86400 when not numeric)
# Returns:   0 to send, 1 to suppress. An alert is suppressed only when the
#            stored fingerprint matches, the stored timestamp is numeric and
#            fewer than ALERT_REPEAT_SECONDS have passed since it.
should_send_alert() {
  local fingerprint="$1"
  local current_ts
  local stored_fp=""
  local stored_ts="0"
  local elapsed

  current_ts="$(date +%s)"

  case "$ALERT_REPEAT_SECONDS" in
    '' | *[!0-9]*) ALERT_REPEAT_SECONDS=86400 ;;
  esac

  if [ -f "$ALERT_STATE_PATH" ]; then
    # A short or unreadable state file simply leaves the fields empty.
    IFS=$'\t' read -r stored_fp stored_ts <"$ALERT_STATE_PATH" || true
  fi

  # A different cause than last time is always reported.
  [ "$fingerprint" = "$stored_fp" ] || return 0

  # A corrupt timestamp cannot justify suppression.
  case "$stored_ts" in
    '' | *[!0-9]*) return 0 ;;
  esac

  elapsed=$((current_ts - stored_ts))
  if [ "$elapsed" -ge "$ALERT_REPEAT_SECONDS" ]; then
    return 0
  fi
  return 1
}
# Persist the fingerprint of the alert that was just sent, with the current time.
#
# Arguments: $1 - fingerprint
# Globals:   ALERT_STATE_PATH (written)
# The record is written to "<state path>.tmp" first and then renamed over the
# state file.
remember_alert() {
  local fingerprint="$1"
  local stamp state_dir staging

  stamp="$(date +%s)"
  state_dir="$(dirname "$ALERT_STATE_PATH")"
  staging="${ALERT_STATE_PATH}.tmp"

  mkdir -p "$state_dir"
  printf '%s\t%s\n' "$fingerprint" "$stamp" >"$staging"
  mv "$staging" "$ALERT_STATE_PATH"
}
# Forget the last sent alert by deleting the state file and any leftover
# staging file next to it.
#
# Globals: ALERT_STATE_PATH (read)
clear_alert_state() {
  rm -f -- "$ALERT_STATE_PATH" "${ALERT_STATE_PATH}.tmp"
}
# Send an alert unless an identical one was already sent recently.
#
# Arguments: $1 - severity
#            $2 - ntfy topic
#            $3 - message body; the list of failing checks is appended to it
# Globals:   SEND_NTFY (read)
# The de-duplication state is only touched when delivery is enabled.
# Otherwise a run with notifications switched off would record the alert as
# sent and suppress the next real notification for the same cause.
send_alert_once() {
  local severity="$1"
  local topic="$2"
  local body="$3"
  local fingerprint
  local summary

  if [ "$SEND_NTFY" != "1" ]; then
    return 0
  fi

  fingerprint="$(alert_fingerprint)"
  summary="$(alert_summary)"
  if [ -n "$summary" ]; then
    body="$body Checks: $summary"
  fi

  if should_send_alert "$fingerprint"; then
    # Only remember the alert once send_ntfy has reported success.
    if send_ntfy "$severity" "$topic" "$body"; then
      remember_alert "$fingerprint"
    fi
  fi
}
# Turn the collected results into the JSON report and raise an alert if needed.
#
# Globals: RESULTS_FILE (read), OUTPUT_PATH (written),
#          CRITICAL_TOPIC, WARNING_TOPIC (read)
# Outputs: the finished report on stdout
# Returns: 2 when at least one check is critical, 1 when at least one check is
#          a warning, 0 otherwise (the stored alert state is cleared then).
write_json() {
  local timestamp
  local critical_count
  local warning_count
  local status
  local first=1
  # Fields of the record currently being rendered; declared local so the
  # read loop below does not leak them into the global scope.
  local severity name message

  timestamp="$(date -Iseconds)"
  critical_count="$(awk -F '\t' '$1 == "critical" { count++ } END { print count + 0 }' "$RESULTS_FILE")"
  warning_count="$(awk -F '\t' '$1 == "warning" { count++ } END { print count + 0 }' "$RESULTS_FILE")"

  # The overall status is the worst severity present.
  if [ "$critical_count" -gt 0 ]; then
    status="critical"
  elif [ "$warning_count" -gt 0 ]; then
    status="warning"
  else
    status="ok"
  fi

  mkdir -p "$(dirname "$OUTPUT_PATH")"
  {
    printf '{\n'
    printf ' "timestamp": "%s",\n' "$(printf '%s' "$timestamp" | json_escape)"
    printf ' "status": "%s",\n' "$status"
    printf ' "critical_count": %s,\n' "$critical_count"
    printf ' "warning_count": %s,\n' "$warning_count"
    printf ' "checks": [\n'
    while IFS="$(printf '\t')" read -r severity name message; do
      # Separate entries with a comma; the first entry gets none.
      if [ "$first" -eq 0 ]; then
        printf ',\n'
      fi
      first=0
      printf ' {"severity":"%s","name":"%s","message":"%s"}' \
        "$(printf '%s' "$severity" | json_escape)" \
        "$(printf '%s' "$name" | json_escape)" \
        "$(printf '%s' "$message" | json_escape)"
    done < "$RESULTS_FILE"
    printf '\n ]\n'
    printf '}\n'
  } > "$OUTPUT_PATH.tmp"
  # Write to a sibling file first and rename it over the report afterwards.
  mv "$OUTPUT_PATH.tmp" "$OUTPUT_PATH"
  cat "$OUTPUT_PATH"

  if [ "$status" = "critical" ]; then
    send_alert_once "critical" "$CRITICAL_TOPIC" "Posture-check critical: $critical_count critical, $warning_count warning. See $OUTPUT_PATH"
    return 2
  fi
  if [ "$status" = "warning" ]; then
    send_alert_once "warning" "$WARNING_TOPIC" "Posture-check warning: $warning_count warning. See $OUTPUT_PATH"
    return 1
  fi
  clear_alert_state
}
# Run every posture check in order and then emit the report.
#
# Returns: the status of write_json, which the script exits with because this
#          function is the last command executed.
main() {
  # Loop variable for the share checks; kept out of the global scope.
  local share

  # Record missing tools as warnings but keep going; the individual checks
  # deal with their own prerequisites.
  need_cmd findmnt || true
  need_cmd df || true
  need_cmd awk || true

  check_fstype "/mnt/cache" "xfs" "critical" "cache_fstype"
  check_disk1_fstype
  check_no_ntfs_on_core_mounts
  check_mover_drift
  check_inode_usage "/mnt/cache" 80 "cache_inode_usage"
  check_disk1_inode_usage
  check_filesystem_usage "/mnt/cache" 70 "cache_fill_level" "warning"

  for share in appdata system domains; do
    if [ -e "/mnt/user/$share" ]; then
      check_filesystem_usage "/mnt/user/$share" 70 "share_${share}_fill_level" "warning"
    else
      add_result "warning" "share_${share}_fill_level" "/mnt/user/$share missing"
    fi
  done

  check_nvme_smart
  write_json
}

main "$@"