Files
homelab-infra/services/posture-check/posture-check.sh
T
2026-05-17 14:57:45 +02:00

285 lines
8.2 KiB
Bash
Executable File

#!/usr/bin/env bash
#
# Posture check: runs a series of storage/host checks, collects one result
# per check in a scratch file, and writes them out as a JSON report.
#
# Optional environment overrides:
#   OUTPUT_PATH       Destination of the JSON report.
#   NTFY_BASE_URL     Base URL that notifications are posted to.
#   WARNING_TOPIC     Topic used when the overall status is "warning".
#   CRITICAL_TOPIC    Topic used when the overall status is "critical".
#   SEND_NTFY         "1" enables notifications; any other value disables them.
#   TMP_DIR           Directory that holds the per-run results file.
#   ALLOW_DISK1_NTFS  "1" tolerates an NTFS filesystem on /mnt/disk1.
set -euo pipefail

OUTPUT_PATH="${OUTPUT_PATH:-/mnt/user/services/posture-check/last.json}"
NTFY_BASE_URL="${NTFY_BASE_URL:-https://ntfy.kaleschke.info}"
WARNING_TOPIC="${WARNING_TOPIC:-homelab-alerts}"
CRITICAL_TOPIC="${CRITICAL_TOPIC:-homelab-alerts}"
SEND_NTFY="${SEND_NTFY:-1}"
TMP_DIR="${TMP_DIR:-/tmp/kallilab-posture-check}"
ALLOW_DISK1_NTFS="${ALLOW_DISK1_NTFS:-1}"

mkdir -p "$TMP_DIR"
# mktemp creates the (empty) file atomically under an unpredictable name, so
# a pre-planted file or symlink at a guessable "results.<pid>" path cannot be
# truncated or reused by this run.
RESULTS_FILE="$(mktemp "$TMP_DIR/results.XXXXXX")"
# Exit hook: delete this run's scratch results file. Registered on EXIT so it
# runs on every way out of the script.
cleanup() { rm -f "$RESULTS_FILE"; }
trap cleanup EXIT
# Escape stdin for embedding inside a JSON string literal and write the
# result to stdout.
#
# Handles backslash, double quote, tab, carriage return and embedded
# newlines. A trailing newline on the input is kept as a line terminator
# rather than escaped; callers in this file feed input without one.
# Relies on GNU sed escape sequences (\t, \r, \n in patterns).
json_escape() {
  # The ":a / $!{N;ba}" loop pulls every input line into the pattern space
  # before substituting, so line breaks inside the value become visible to
  # the final s/\n/.../ rule. "N" is guarded by "$!" because GNU sed stops
  # processing when N is run on the last line.
  # Backslashes must be escaped first so later rules do not double-escape.
  sed \
    -e ':a' \
    -e '$!{' \
    -e 'N' \
    -e 'ba' \
    -e '}' \
    -e 's/\\/\\\\/g' \
    -e 's/"/\\"/g' \
    -e 's/\t/\\t/g' \
    -e 's/\r/\\r/g' \
    -e 's/\n/\\n/g'
}
# Append one check result to $RESULTS_FILE as a tab-separated record.
#
# Globals:   RESULTS_FILE (appended to)
# Arguments: $1 - severity (callers in this file pass ok/warning/critical)
#            $2 - check name
#            $3 - human-readable message
#
# The results file is parsed later as one record per line with tab-separated
# fields, so any tab, carriage return or newline inside a field is replaced
# by a space; otherwise captured multi-line command output would split one
# result into several malformed records.
add_result() {
  local severity="$1"
  local name="$2"
  local message="$3"
  local separators=$'\t\r\n'

  severity="${severity//[$separators]/ }"
  name="${name//[$separators]/ }"
  message="${message//[$separators]/ }"

  printf '%s\t%s\t%s\n' "$severity" "$name" "$message" >> "$RESULTS_FILE"
}
# Verify that a command is available.
#
# Arguments: $1 - command name to look up
# Returns:   0 when the command resolves; otherwise records a "warning"
#            result named command_<name> and returns 1.
need_cmd() {
  local tool="$1"

  command -v "$tool" >/dev/null 2>&1 && return 0

  add_result "warning" "command_$tool" "Command missing: $tool"
  return 1
}
# Compare the filesystem type reported by findmnt for a mount point with the
# expected type and record the outcome.
#
# Arguments: $1 - mount point to inspect
#            $2 - expected filesystem type
#            $3 - severity to record when the mount is missing or mismatched
#            $4 - result name
check_fstype() {
  local mount_point="$1"
  local wanted="$2"
  local fail_level="$3"
  local check_id="$4"
  local found

  # Without findmnt the check cannot run; that is recorded as a warning
  # regardless of the requested severity.
  command -v findmnt >/dev/null 2>&1 || {
    add_result "warning" "$check_id" "Cannot check $mount_point filesystem because findmnt is missing"
    return
  }

  # A non-zero findmnt status is reported as a missing mount.
  found="$(findmnt -no FSTYPE "$mount_point" 2>/dev/null)" || {
    add_result "$fail_level" "$check_id" "Mount not found: $mount_point"
    return
  }

  if [ "$found" != "$wanted" ]; then
    add_result "$fail_level" "$check_id" "$mount_point filesystem is $found, expected $wanted"
  else
    add_result "ok" "$check_id" "$mount_point filesystem is $found"
  fi
}
# Record whether any mount at or below the core mount points uses an
# NTFS-like filesystem type (ntfs3 or fuseblk).
#
# Globals: ALLOW_DISK1_NTFS (read) - when "1", only /mnt/cache is inspected
#          and a clean result is still recorded as a warning, because NTFS
#          on /mnt/disk1 is merely tolerated.
check_no_ntfs_on_core_mounts() {
  local check_id="no_ntfs_core_mounts"
  local scope_re
  local offenders

  if ! command -v findmnt >/dev/null 2>&1; then
    add_result "warning" "$check_id" "Cannot check NTFS mounts because findmnt is missing"
    return
  fi

  # Choose which mount targets are in scope for the scan.
  case "$ALLOW_DISK1_NTFS" in
    1) scope_re="^/mnt/cache(/|$)" ;;
    *) scope_re="^/mnt/(cache|disk1)(/|$)" ;;
  esac

  # Build a comma-separated list of "target:fstype" for every in-scope mount
  # whose type is ntfs3 or fuseblk.
  offenders="$(
    findmnt -rn -o TARGET,FSTYPE 2>/dev/null \
      | awk -v pattern="$scope_re" '$1 ~ pattern && ($2 == "ntfs3" || $2 == "fuseblk") { print $1 ":" $2 }' \
      | paste -sd ',' -
  )"

  if [ -n "$offenders" ]; then
    add_result "critical" "$check_id" "NTFS-like filesystem on core mount: $offenders"
    return
  fi

  if [ "$ALLOW_DISK1_NTFS" = "1" ]; then
    add_result "warning" "$check_id" "No NTFS on /mnt/cache; /mnt/disk1 NTFS is temporarily allowed until Disk1 phase 2 migration"
  else
    add_result "ok" "$check_id" "No ntfs3/fuseblk mounts below /mnt/cache or /mnt/disk1"
  fi
}
# Record whether /mnt/disk1/appdata holds any entries. A missing or empty
# directory is "ok"; any entry at all is recorded as "critical".
check_mover_drift() {
  local appdata_dir="/mnt/disk1/appdata"
  local check_id="mover_drift_appdata"

  [ -d "$appdata_dir" ] || {
    add_result "ok" "$check_id" "$appdata_dir does not exist"
    return
  }

  # -quit stops find after the first entry; grep only tests for any output.
  if find "$appdata_dir" -mindepth 1 -print -quit | grep -q .; then
    add_result "critical" "$check_id" "$appdata_dir contains entries; appdata should stay cache-only"
  else
    add_result "ok" "$check_id" "$appdata_dir is empty"
  fi
}
# Record inode usage for the filesystem containing a path.
#
# Arguments: $1 - path handed to df
#            $2 - threshold in percent; usage at or above it is a warning
#            $3 - result name
check_inode_usage() {
  local target="$1"
  local limit="$2"
  local check_id="$3"
  local pct

  # Fifth column of the second "df -Pi" line, with the percent sign removed.
  pct="$(df -Pi "$target" 2>/dev/null | awk 'NR==2 { gsub("%", "", $5); print $5 }')" || {
    add_result "warning" "$check_id" "Cannot read inode usage for $target"
    return
  }

  # Anything that is not a plain run of digits cannot be compared.
  case "$pct" in
    '' | *[!0-9]*)
      add_result "warning" "$check_id" "$target inode usage unavailable (${pct:-unknown})"
      return
      ;;
  esac

  if [ "$pct" -lt "$limit" ]; then
    add_result "ok" "$check_id" "$target inode usage ${pct}%"
  else
    add_result "warning" "$check_id" "$target inode usage ${pct}% >= ${limit}%"
  fi
}
# Record space usage for the filesystem containing a path.
#
# Arguments: $1 - path handed to df
#            $2 - threshold in percent; usage at or above it is reported
#            $3 - result name
#            $4 - severity recorded when the threshold is reached
#
# Failure to read or parse the usage is always recorded as a "warning",
# matching check_inode_usage.
check_filesystem_usage() {
  local path="$1"
  local max_percent="$2"
  local name="$3"
  local severity="$4"
  local use_percent

  # Fifth column of the second "df -P" line, with the percent sign removed.
  if ! use_percent="$(df -P "$path" 2>/dev/null | awk 'NR==2 { gsub("%", "", $5); print $5 }')"; then
    add_result "warning" "$name" "Cannot read filesystem usage for $path"
    return
  fi

  # Reject empty or non-numeric values (e.g. "-") before the integer
  # comparison; otherwise the failed test would fall through to the
  # "over threshold" branch and report a bogus breach.
  case "$use_percent" in
    '' | *[!0-9]*)
      add_result "warning" "$name" "$path usage unavailable (${use_percent:-unknown})"
      return
      ;;
  esac

  if [ "$use_percent" -lt "$max_percent" ]; then
    add_result "ok" "$name" "$path usage ${use_percent}%"
  else
    add_result "$severity" "$name" "$path usage ${use_percent}% >= ${max_percent}%"
  fi
}
# Read the NVMe SMART log and record three results: critical_warning,
# percentage_used and media_errors.
#
# Globals: NVME_DEVICE (read, optional) - device to query; defaults to
#          /dev/nvme0n1.
check_nvme_smart() {
  local dev="${NVME_DEVICE:-/dev/nvme0n1}"
  local log_text
  local crit_flag
  local wear_pct
  local media_err

  # need_cmd records the missing tool itself; nothing more to do here.
  need_cmd nvme || return 0

  log_text="$(nvme smart-log "$dev" 2>/dev/null)" || {
    add_result "critical" "nvme_smart" "Cannot read nvme smart-log for $dev"
    return
  }

  # Each field is taken from the first "key : value" line that matches.
  crit_flag="$(awk -F: '/critical_warning/ { gsub(/[[:space:]]/, "", $2); print $2; exit }' <<< "$log_text")"
  wear_pct="$(awk -F: '/percentage_used/ { gsub(/[^0-9]/, "", $2); print $2; exit }' <<< "$log_text")"
  media_err="$(awk -F: '/media_errors/ { gsub(/[^0-9]/, "", $2); print $2; exit }' <<< "$log_text")"

  # A missing value is treated as 0; "0" and "0x00" both count as clear.
  case "${crit_flag:-0}" in
    0 | 0x00)
      add_result "ok" "nvme_critical_warning" "$dev critical_warning ${crit_flag:-0}"
      ;;
    *)
      add_result "critical" "nvme_critical_warning" "$dev critical_warning ${crit_flag}"
      ;;
  esac

  # A missing percentage is reported as critical, same as one at or above 80.
  if [ -n "${wear_pct:-}" ] && [ "$wear_pct" -lt 80 ]; then
    add_result "ok" "nvme_percentage_used" "$dev percentage_used ${wear_pct}%"
  else
    add_result "critical" "nvme_percentage_used" "$dev percentage_used ${wear_pct:-unknown}, expected <80"
  fi

  if [ "${media_err:-0}" != "0" ]; then
    add_result "warning" "nvme_media_errors" "$dev media_errors ${media_err}"
  else
    add_result "ok" "nvme_media_errors" "$dev media_errors 0"
  fi
}
# Post a notification, best effort.
#
# Globals:   SEND_NTFY (read)      - only "1" enables sending
#            NTFY_BASE_URL (read)  - base URL; the topic is appended as a path
#            NTFY_TIMEOUT (read, optional) - total seconds allowed for the
#              request; defaults to 15
# Arguments: $1 - severity, used in the Title header
#            $2 - topic
#            $3 - message body
# Returns:   always 0; a missing curl or a failed request never fails the run.
send_ntfy() {
  local severity="$1"
  local topic="$2"
  local body="$3"
  local timeout="${NTFY_TIMEOUT:-15}"

  if [ "$SEND_NTFY" != "1" ]; then
    return 0
  fi

  if command -v curl >/dev/null 2>&1; then
    # curl has no overall timeout by default, so an unresponsive endpoint
    # would otherwise block the whole posture check. Both the connection
    # phase and the total transfer are bounded by the same value.
    # Delivery failures are deliberately ignored (|| true).
    printf '%s\n' "$body" | curl -fsS \
      --connect-timeout "$timeout" \
      --max-time "$timeout" \
      -H "Title: KalliLab posture-check $severity" \
      -H "Priority: high" \
      --data-binary @- \
      "$NTFY_BASE_URL/$topic" >/dev/null || true
  fi
}
# Summarise the collected results, write the JSON report, print it, and
# notify when the overall status is not "ok".
#
# Globals:   RESULTS_FILE (read)   - tab-separated severity/name/message lines
#            OUTPUT_PATH (read)    - report destination; parent dir is created
#            CRITICAL_TOPIC, WARNING_TOPIC (read) - notification topics
# Outputs:   the finished report on stdout
# Returns:   2 if any result is "critical", 1 if any is "warning", else 0
write_json() {
  local timestamp
  local critical_count
  local warning_count
  local status
  local first=1
  # Loop variables for the read below; declared local so parsing the results
  # file does not overwrite same-named variables in the caller's scope.
  local severity
  local name
  local message

  timestamp="$(date -Iseconds)"
  # "count + 0" makes awk print 0 instead of an empty string when nothing matched.
  critical_count="$(awk -F '\t' '$1 == "critical" { count++ } END { print count + 0 }' "$RESULTS_FILE")"
  warning_count="$(awk -F '\t' '$1 == "warning" { count++ } END { print count + 0 }' "$RESULTS_FILE")"

  # Overall status is the worst severity seen.
  if [ "$critical_count" -gt 0 ]; then
    status="critical"
  elif [ "$warning_count" -gt 0 ]; then
    status="warning"
  else
    status="ok"
  fi

  mkdir -p "$(dirname "$OUTPUT_PATH")"
  # Write to a sibling temp file and rename it into place, so readers of
  # OUTPUT_PATH never see a half-written report.
  {
    printf '{\n'
    printf ' "timestamp": "%s",\n' "$(printf '%s' "$timestamp" | json_escape)"
    printf ' "status": "%s",\n' "$status"
    printf ' "critical_count": %s,\n' "$critical_count"
    printf ' "warning_count": %s,\n' "$warning_count"
    printf ' "checks": [\n'
    while IFS=$'\t' read -r severity name message; do
      # Emit the separator before every element except the first.
      if [ "$first" -eq 0 ]; then
        printf ',\n'
      fi
      first=0
      printf ' {"severity":"%s","name":"%s","message":"%s"}' \
        "$(printf '%s' "$severity" | json_escape)" \
        "$(printf '%s' "$name" | json_escape)" \
        "$(printf '%s' "$message" | json_escape)"
    done < "$RESULTS_FILE"
    printf '\n ]\n'
    printf '}\n'
  } > "$OUTPUT_PATH.tmp"
  mv "$OUTPUT_PATH.tmp" "$OUTPUT_PATH"
  cat "$OUTPUT_PATH"

  if [ "$status" = "critical" ]; then
    send_ntfy "critical" "$CRITICAL_TOPIC" "Posture-check critical: $critical_count critical, $warning_count warning. See $OUTPUT_PATH"
    return 2
  fi
  if [ "$status" = "warning" ]; then
    send_ntfy "warning" "$WARNING_TOPIC" "Posture-check warning: $warning_count warning. See $OUTPUT_PATH"
    return 1
  fi
}
# Run every posture check in order, then write the report.
#
# Globals: ALLOW_DISK1_NTFS (read) - selects which filesystem type is
#          expected on /mnt/disk1 and how severe a mismatch is.
# Returns: the status of write_json, which is the last command run.
main() {
  # Kept local so the loop below does not leak into the global scope.
  local share

  # Missing tools are only recorded here; the individual checks decide how
  # to proceed, so a failed lookup must not abort the run.
  need_cmd findmnt || true
  need_cmd df || true
  need_cmd awk || true

  check_fstype "/mnt/cache" "xfs" "critical" "cache_fstype"
  if [ "$ALLOW_DISK1_NTFS" = "1" ]; then
    check_fstype "/mnt/disk1" "ntfs3" "warning" "disk1_fstype"
  else
    check_fstype "/mnt/disk1" "xfs" "critical" "disk1_fstype"
  fi
  check_no_ntfs_on_core_mounts
  check_mover_drift

  check_inode_usage "/mnt/cache" 80 "cache_inode_usage"
  check_inode_usage "/mnt/disk1" 80 "disk1_inode_usage"
  check_filesystem_usage "/mnt/cache" 70 "cache_fill_level" "warning"

  # A missing share path is itself recorded as a warning.
  for share in appdata system domains; do
    if [ -e "/mnt/user/$share" ]; then
      check_filesystem_usage "/mnt/user/$share" 70 "share_${share}_fill_level" "warning"
    else
      add_result "warning" "share_${share}_fill_level" "/mnt/user/$share missing"
    fi
  done

  check_nvme_smart
  write_json
}
# Script entry point: forward the command-line arguments to main. As the last
# command, main's return status becomes the script's exit status.
main "$@"