#!/usr/bin/env bash
#
# KalliLab posture check.
#
# Runs a fixed set of host checks (filesystem types, NTFS on core mounts,
# mover drift, inode and fill levels, NVMe SMART values, Authelia config
# drift), writes the results as JSON to OUTPUT_PATH, prints that JSON to
# stdout and sends an ntfy notification when the overall status is warning
# or critical.
#
# Exit status: 0 = ok, 1 = warning (or an unexpected command failure under
# `set -e`), 2 = critical.
#
# Environment overrides (defaults are set below):
#   OUTPUT_PATH           JSON report path
#   NTFY_BASE_URL         base URL of the ntfy server
#   WARNING_TOPIC         ntfy topic for warning status
#   CRITICAL_TOPIC        ntfy topic for critical status
#   SEND_NTFY             "1" sends notifications, anything else disables them
#   TMP_DIR               directory for the temporary results file
#   ALLOW_DISK1_NTFS      "1" accepts ntfs3/fuseblk on /mnt/disk1
#   ALERT_STATE_PATH      file remembering the last alert fingerprint and time
#   ALERT_REPEAT_SECONDS  minimum age before an unchanged alert is re-sent
#   SKIP_AUTHELIA_DRIFT   "1" skips the Authelia drift check
#   AUTHELIA_DIFF_SCRIPT  path of the Authelia diff script (run via bash)
#   NVME_DEVICE           NVMe device for smart-log (default /dev/nvme0n1)
set -euo pipefail

OUTPUT_PATH="${OUTPUT_PATH:-/mnt/user/services/posture-check/last.json}"
NTFY_BASE_URL="${NTFY_BASE_URL:-https://ntfy.kaleschke.info}"
WARNING_TOPIC="${WARNING_TOPIC:-homelab-alerts}"
CRITICAL_TOPIC="${CRITICAL_TOPIC:-homelab-alerts}"
SEND_NTFY="${SEND_NTFY:-1}"
TMP_DIR="${TMP_DIR:-/tmp/kallilab-posture-check}"
ALLOW_DISK1_NTFS="${ALLOW_DISK1_NTFS:-0}"
ALERT_STATE_PATH="${ALERT_STATE_PATH:-/mnt/user/services/posture-check/last-alert.state}"
ALERT_REPEAT_SECONDS="${ALERT_REPEAT_SECONDS:-86400}"
SKIP_AUTHELIA_DRIFT="${SKIP_AUTHELIA_DRIFT:-0}"
AUTHELIA_DIFF_SCRIPT="${AUTHELIA_DIFF_SCRIPT:-/mnt/user/services/homelab-infra/services/authelia-diff.sh}"

mkdir -p "$TMP_DIR"
# mktemp instead of results.$$: avoids a predictable file name below /tmp.
RESULTS_FILE="$(mktemp "$TMP_DIR/results.XXXXXX")"

# Removes the temporary results file; installed as EXIT trap.
cleanup() {
  rm -f -- "$RESULTS_FILE"
}
trap cleanup EXIT

#######################################
# Escapes stdin for use inside a JSON string literal.
# Backslash, double quote and tab are escaped; other control bytes (except
# newline) are dropped because JSON forbids them unescaped. Callers in this
# script only pass single-line input.
# Outputs: escaped text on stdout
#######################################
json_escape() {
  # $'...' yields a literal tab so the expression does not depend on sed
  # understanding "\t" in a regex.
  LC_ALL=C tr -d '\000-\010\013-\037' \
    | sed \
      -e 's/\\/\\\\/g' \
      -e 's/"/\\"/g' \
      -e $'s/\t/\\\\t/g'
}

#######################################
# Appends one check result to RESULTS_FILE as "severity<TAB>name<TAB>message".
# Arguments: $1 - severity (ok|warning|critical), $2 - check name,
#            $3 - human-readable message
#######################################
add_result() {
  local severity="$1"
  local name="$2"
  local message="$3"

  # The results file is line- and tab-delimited; flatten embedded tabs and
  # line breaks (e.g. multi-line tool output) so one result stays one record.
  name="${name//[$'\t\r\n']/ }"
  message="${message//[$'\t\r\n']/ }"

  printf '%s\t%s\t%s\n' "$severity" "$name" "$message" >> "$RESULTS_FILE"
}

#######################################
# Records a warning result when a command is not available.
# Arguments: $1 - command name
# Returns:   0 if the command exists, 1 otherwise
#######################################
need_cmd() {
  if ! command -v "$1" >/dev/null 2>&1; then
    add_result "warning" "command_$1" "Command missing: $1"
    return 1
  fi
}

#######################################
# Compares the filesystem type reported by findmnt for a path with an
# expected type.
# Arguments: $1 - path, $2 - expected FSTYPE,
#            $3 - severity used on mismatch or missing mount, $4 - check name
#######################################
check_fstype() {
  local path="$1"
  local expected="$2"
  local severity="$3"
  local name="$4"
  local actual

  if ! command -v findmnt >/dev/null 2>&1; then
    add_result "warning" "$name" "Cannot check $path filesystem because findmnt is missing"
    return
  fi

  if ! actual="$(findmnt -no FSTYPE "$path" 2>/dev/null)"; then
    add_result "$severity" "$name" "Mount not found: $path"
    return
  fi

  if [ "$actual" = "$expected" ]; then
    add_result "ok" "$name" "$path filesystem is $actual"
  else
    add_result "$severity" "$name" "$path filesystem is $actual, expected $expected"
  fi
}

#######################################
# Checks the filesystem type of /mnt/disk1.
# With ALLOW_DISK1_NTFS=1, ntfs3/fuseblk is ok and anything else a warning;
# otherwise xfs is ok and anything else critical.
# Globals: ALLOW_DISK1_NTFS (read)
#######################################
check_disk1_fstype() {
  local actual

  if ! command -v findmnt >/dev/null 2>&1; then
    add_result "warning" "disk1_fstype" "Cannot check /mnt/disk1 filesystem because findmnt is missing"
    return
  fi

  if ! actual="$(findmnt -no FSTYPE "/mnt/disk1" 2>/dev/null)"; then
    add_result "warning" "disk1_fstype" "Mount not found: /mnt/disk1"
    return
  fi

  if [ "$ALLOW_DISK1_NTFS" = "1" ]; then
    if [ "$actual" = "ntfs3" ] || [ "$actual" = "fuseblk" ]; then
      add_result "ok" "disk1_fstype" "/mnt/disk1 filesystem is $actual; temporarily allowed until Disk1 phase 2 migration"
    else
      add_result "warning" "disk1_fstype" "/mnt/disk1 filesystem is $actual, expected ntfs3/fuseblk during temporary Disk1 migration exception"
    fi
  else
    if [ "$actual" = "xfs" ]; then
      add_result "ok" "disk1_fstype" "/mnt/disk1 filesystem is $actual"
    else
      add_result "critical" "disk1_fstype" "/mnt/disk1 filesystem is $actual, expected xfs"
    fi
  fi
}

#######################################
# Reports critical when any mount at or below /mnt/cache (and /mnt/disk1,
# unless ALLOW_DISK1_NTFS=1) has filesystem type ntfs3 or fuseblk.
# Globals: ALLOW_DISK1_NTFS (read)
#######################################
check_no_ntfs_on_core_mounts() {
  local hits
  local pattern="^/mnt/(cache|disk1)(/|$)"

  if ! command -v findmnt >/dev/null 2>&1; then
    add_result "warning" "no_ntfs_core_mounts" "Cannot check NTFS mounts because findmnt is missing"
    return
  fi

  if [ "$ALLOW_DISK1_NTFS" = "1" ]; then
    pattern="^/mnt/cache(/|$)"
  fi

  # Guarded with `if !`: under pipefail + errexit a failing findmnt would
  # otherwise abort the whole run before any report is written.
  if ! hits="$(findmnt -rn -o TARGET,FSTYPE 2>/dev/null \
    | awk -v pattern="$pattern" '$1 ~ pattern && ($2 == "ntfs3" || $2 == "fuseblk") { print $1 ":" $2 }' \
    | paste -sd ',' -)"; then
    add_result "warning" "no_ntfs_core_mounts" "Cannot list mounts with findmnt"
    return
  fi

  if [ -n "$hits" ]; then
    add_result "critical" "no_ntfs_core_mounts" "NTFS-like filesystem on core mount: $hits"
  elif [ "$ALLOW_DISK1_NTFS" = "1" ]; then
    add_result "ok" "no_ntfs_core_mounts" "No NTFS on /mnt/cache; /mnt/disk1 NTFS is temporarily allowed until Disk1 phase 2 migration"
  else
    add_result "ok" "no_ntfs_core_mounts" "No ntfs3/fuseblk mounts below /mnt/cache or /mnt/disk1"
  fi
}

#######################################
# Reports critical when /mnt/disk1/appdata exists and contains any entry.
#######################################
check_mover_drift() {
  local path="/mnt/disk1/appdata"

  if [ ! -d "$path" ]; then
    add_result "ok" "mover_drift_appdata" "$path does not exist"
    return
  fi

  # -print -quit stops at the first entry; grep -q . tests for any output.
  if find "$path" -mindepth 1 -print -quit | grep -q .; then
    add_result "critical" "mover_drift_appdata" "$path contains entries; appdata should stay cache-only"
  else
    add_result "ok" "mover_drift_appdata" "$path is empty"
  fi
}

#######################################
# Warns when inode usage (df -Pi, column 5) of a path reaches a threshold.
# Arguments: $1 - path, $2 - threshold in percent, $3 - check name
#######################################
check_inode_usage() {
  local path="$1"
  local max_percent="$2"
  local name="$3"
  local use_percent

  if ! use_percent="$(df -Pi "$path" 2>/dev/null | awk 'NR==2 { gsub("%", "", $5); print $5 }')"; then
    add_result "warning" "$name" "Cannot read inode usage for $path"
    return
  fi

  if ! printf '%s' "$use_percent" | grep -Eq '^[0-9]+$'; then
    add_result "warning" "$name" "$path inode usage unavailable (${use_percent:-unknown})"
    return
  fi

  if [ "$use_percent" -lt "$max_percent" ]; then
    add_result "ok" "$name" "$path inode usage ${use_percent}%"
  else
    add_result "warning" "$name" "$path inode usage ${use_percent}% >= ${max_percent}%"
  fi
}

#######################################
# Inode check for /mnt/disk1 (threshold 80); skipped with an ok result when
# ALLOW_DISK1_NTFS=1.
# Globals: ALLOW_DISK1_NTFS (read)
#######################################
check_disk1_inode_usage() {
  if [ "$ALLOW_DISK1_NTFS" = "1" ]; then
    add_result "ok" "disk1_inode_usage" "/mnt/disk1 inode usage skipped; NTFS transition filesystem does not expose POSIX inode usage"
    return
  fi

  check_inode_usage "/mnt/disk1" 80 "disk1_inode_usage"
}

#######################################
# Reports when space usage (df -P, column 5) of a path reaches a threshold.
# Arguments: $1 - path, $2 - threshold in percent, $3 - check name,
#            $4 - severity used when the threshold is reached
#######################################
check_filesystem_usage() {
  local path="$1"
  local max_percent="$2"
  local name="$3"
  local severity="$4"
  local use_percent

  if ! use_percent="$(df -P "$path" 2>/dev/null | awk 'NR==2 { gsub("%", "", $5); print $5 }')"; then
    add_result "warning" "$name" "Cannot read filesystem usage for $path"
    return
  fi

  # Same guard as check_inode_usage: a non-numeric value would make the
  # integer comparison below error out and report a bogus threshold breach.
  if ! printf '%s' "$use_percent" | grep -Eq '^[0-9]+$'; then
    add_result "warning" "$name" "$path usage unavailable (${use_percent:-unknown})"
    return
  fi

  if [ "$use_percent" -lt "$max_percent" ]; then
    add_result "ok" "$name" "$path usage ${use_percent}%"
  else
    add_result "$severity" "$name" "$path usage ${use_percent}% >= ${max_percent}%"
  fi
}

#######################################
# Evaluates critical_warning, percentage_used and media_errors from
# `nvme smart-log` and records one result for each.
# Globals: NVME_DEVICE (read, optional)
#######################################
check_nvme_smart() {
  local device="${NVME_DEVICE:-/dev/nvme0n1}"
  local smart
  local warning
  local percentage_used
  local media_errors

  if ! need_cmd nvme; then
    return
  fi

  if ! smart="$(nvme smart-log "$device" 2>/dev/null)"; then
    add_result "critical" "nvme_smart" "Cannot read nvme smart-log for $device"
    return
  fi

  # Here-strings instead of `printf | awk ... exit`, so an early awk exit
  # cannot surface as a pipeline failure under pipefail.
  warning="$(awk -F: '/critical_warning/ { gsub(/[[:space:]]/, "", $2); print $2; exit }' <<<"$smart")"
  percentage_used="$(awk -F: '/percentage_used/ { gsub(/[^0-9]/, "", $2); print $2; exit }' <<<"$smart")"
  media_errors="$(awk -F: '/media_errors/ { gsub(/[^0-9]/, "", $2); print $2; exit }' <<<"$smart")"

  if [ "${warning:-0}" = "0" ] || [ "${warning:-0}" = "0x00" ]; then
    add_result "ok" "nvme_critical_warning" "$device critical_warning ${warning:-0}"
  else
    add_result "critical" "nvme_critical_warning" "$device critical_warning ${warning}"
  fi

  if [ -n "${percentage_used:-}" ] && [ "$percentage_used" -lt 80 ]; then
    add_result "ok" "nvme_percentage_used" "$device percentage_used ${percentage_used}%"
  else
    add_result "critical" "nvme_percentage_used" "$device percentage_used ${percentage_used:-unknown}, expected <80"
  fi

  if [ "${media_errors:-0}" = "0" ]; then
    add_result "ok" "nvme_media_errors" "$device media_errors 0"
  else
    add_result "warning" "nvme_media_errors" "$device media_errors ${media_errors}"
  fi
}

#######################################
# Runs AUTHELIA_DIFF_SCRIPT via bash and maps its exit status to a result:
# 0 = ok, 1/2/3 and anything else = warning with a status-specific message.
# Globals: SKIP_AUTHELIA_DRIFT, AUTHELIA_DIFF_SCRIPT (read)
#######################################
check_authelia_config_drift() {
  local output=""
  local rc=0

  if [ "$SKIP_AUTHELIA_DRIFT" = "1" ]; then
    add_result "ok" "authelia_config_drift" "Authelia drift check skipped via SKIP_AUTHELIA_DRIFT=1"
    return
  fi

  # The script is executed with `bash <file>`, so it only has to be a regular
  # file; testing -x as well would let directories through.
  if [ ! -f "$AUTHELIA_DIFF_SCRIPT" ]; then
    add_result "warning" "authelia_config_drift" "Authelia diff script missing: $AUTHELIA_DIFF_SCRIPT"
    return
  fi

  # `|| rc=$?` is required: a bare assignment from a failing command
  # substitution would terminate the script under `set -e` before the
  # status could be inspected.
  output="$(bash "$AUTHELIA_DIFF_SCRIPT" 2>&1)" || rc=$?

  case "$rc" in
    0)
      add_result "ok" "authelia_config_drift" "Authelia repo baseline matches host config (access_control)"
      ;;
    1)
      add_result "warning" "authelia_config_drift" "Authelia repo<->host drift in access_control; run authelia-diff.sh for details"
      ;;
    2)
      add_result "warning" "authelia_config_drift" "Authelia diff aborted: $output"
      ;;
    3)
      add_result "warning" "authelia_config_drift" "Authelia diff: section missing in repo or host: $output"
      ;;
    *)
      add_result "warning" "authelia_config_drift" "Authelia diff returned unexpected rc=$rc: $output"
      ;;
  esac
}

#######################################
# Posts a notification body to "$NTFY_BASE_URL/<topic>" with curl.
# Globals:   SEND_NTFY, NTFY_BASE_URL (read)
# Arguments: $1 - severity (used in the title), $2 - topic, $3 - body
# Returns:   0 when sending is disabled or the request succeeded,
#            non-zero when curl is missing or the request failed.
#            Call only in a conditional context so `set -e` does not abort.
#######################################
send_ntfy() {
  local severity="$1"
  local topic="$2"
  local body="$3"

  if [ "$SEND_NTFY" != "1" ]; then
    return 0
  fi

  if ! command -v curl >/dev/null 2>&1; then
    printf 'posture-check: curl is missing, cannot send ntfy notification\n' >&2
    return 1
  fi

  printf '%s\n' "$body" | curl -fsS \
    -H "Title: KalliLab posture-check $severity" \
    -H "Priority: high" \
    --data-binary @- \
    "$NTFY_BASE_URL/$topic" >/dev/null
}

#######################################
# Prints a checksum ("crc:length" from cksum) over all non-ok results.
#######################################
alert_fingerprint() {
  awk -F '\t' '$1 != "ok" { printf "%s|%s|%s\n", $1, $2, $3 }' "$RESULTS_FILE" \
    | cksum \
    | awk '{ print $1 ":" $2 }'
}

#######################################
# Prints all non-ok results as "severity:name; severity:name".
#######################################
alert_summary() {
  awk -F '\t' '$1 != "ok" { printf "%s:%s; ", $1, $2 }' "$RESULTS_FILE" \
    | sed 's/; $//'
}

#######################################
# Decides whether an alert should be sent for a fingerprint.
# Globals:   ALERT_STATE_PATH (read), ALERT_REPEAT_SECONDS (read; reset to
#            86400 when not numeric)
# Arguments: $1 - current fingerprint
# Returns:   0 = send (fingerprint changed, state unreadable, or the last
#            send is at least ALERT_REPEAT_SECONDS old), 1 = suppress
#######################################
should_send_alert() {
  local fingerprint="$1"
  local now
  local last_fingerprint=""
  local last_sent="0"

  now="$(date +%s)"

  if ! printf '%s' "$ALERT_REPEAT_SECONDS" | grep -Eq '^[0-9]+$'; then
    ALERT_REPEAT_SECONDS=86400
  fi

  if [ -f "$ALERT_STATE_PATH" ]; then
    # `|| true`: an empty or unterminated state file must not abort the run.
    IFS=$'\t' read -r last_fingerprint last_sent < "$ALERT_STATE_PATH" || true
  fi

  if [ "$fingerprint" != "$last_fingerprint" ]; then
    return 0
  fi

  if ! printf '%s' "$last_sent" | grep -Eq '^[0-9]+$'; then
    return 0
  fi

  # 10# forces base 10 so a value with leading zeros is not read as octal.
  if [ $((now - 10#$last_sent)) -ge "$ALERT_REPEAT_SECONDS" ]; then
    return 0
  fi

  return 1
}

#######################################
# Stores "fingerprint<TAB>epoch" in ALERT_STATE_PATH (write to .tmp, then mv).
# Arguments: $1 - fingerprint
#######################################
remember_alert() {
  local fingerprint="$1"
  local now

  now="$(date +%s)"
  mkdir -p "$(dirname "$ALERT_STATE_PATH")"
  printf '%s\t%s\n' "$fingerprint" "$now" > "$ALERT_STATE_PATH.tmp"
  mv "$ALERT_STATE_PATH.tmp" "$ALERT_STATE_PATH"
}

# Removes the alert state file and its temporary sibling.
clear_alert_state() {
  rm -f "$ALERT_STATE_PATH" "$ALERT_STATE_PATH.tmp"
}

#######################################
# Sends an alert unless the same set of non-ok results was already reported
# within ALERT_REPEAT_SECONDS.
# Arguments: $1 - severity, $2 - topic, $3 - message body
#######################################
send_alert_once() {
  local severity="$1"
  local topic="$2"
  local body="$3"
  local fingerprint
  local summary

  fingerprint="$(alert_fingerprint)"
  summary="$(alert_summary)"

  if [ -n "$summary" ]; then
    body="$body Checks: $summary"
  fi

  if should_send_alert "$fingerprint"; then
    # Only remember the alert once it has actually gone out; otherwise a
    # failed delivery would silence the alert for ALERT_REPEAT_SECONDS.
    if send_ntfy "$severity" "$topic" "$body"; then
      remember_alert "$fingerprint"
    else
      printf 'posture-check: ntfy notification failed; will retry on next run\n' >&2
    fi
  fi
}

#######################################
# Writes the JSON report to OUTPUT_PATH (via .tmp + mv), prints it, and
# triggers or clears alerting according to the overall status.
# Globals: RESULTS_FILE, OUTPUT_PATH, CRITICAL_TOPIC, WARNING_TOPIC (read)
# Returns: 0 = ok, 1 = warning, 2 = critical
#######################################
write_json() {
  local timestamp
  local critical_count
  local warning_count
  local status
  local first=1
  local severity
  local name
  local message

  timestamp="$(date -Iseconds)"
  critical_count="$(awk -F '\t' '$1 == "critical" { count++ } END { print count + 0 }' "$RESULTS_FILE")"
  warning_count="$(awk -F '\t' '$1 == "warning" { count++ } END { print count + 0 }' "$RESULTS_FILE")"

  if [ "$critical_count" -gt 0 ]; then
    status="critical"
  elif [ "$warning_count" -gt 0 ]; then
    status="warning"
  else
    status="ok"
  fi

  mkdir -p "$(dirname "$OUTPUT_PATH")"

  {
    printf '{\n'
    printf ' "timestamp": "%s",\n' "$(printf '%s' "$timestamp" | json_escape)"
    printf ' "status": "%s",\n' "$status"
    printf ' "critical_count": %s,\n' "$critical_count"
    printf ' "warning_count": %s,\n' "$warning_count"
    printf ' "checks": [\n'
    while IFS=$'\t' read -r severity name message; do
      # Separator goes before every element except the first.
      if [ "$first" -eq 0 ]; then
        printf ',\n'
      fi
      first=0
      printf ' {"severity":"%s","name":"%s","message":"%s"}' \
        "$(printf '%s' "$severity" | json_escape)" \
        "$(printf '%s' "$name" | json_escape)" \
        "$(printf '%s' "$message" | json_escape)"
    done < "$RESULTS_FILE"
    printf '\n ]\n'
    printf '}\n'
  } > "$OUTPUT_PATH.tmp"

  mv "$OUTPUT_PATH.tmp" "$OUTPUT_PATH"
  cat "$OUTPUT_PATH"

  if [ "$status" = "critical" ]; then
    send_alert_once "critical" "$CRITICAL_TOPIC" "Posture-check critical: $critical_count critical, $warning_count warning. See $OUTPUT_PATH"
    return 2
  fi

  if [ "$status" = "warning" ]; then
    send_alert_once "warning" "$WARNING_TOPIC" "Posture-check warning: $warning_count warning. See $OUTPUT_PATH"
    return 1
  fi

  clear_alert_state
}

#######################################
# Runs all checks in order and writes the report; the return status of
# write_json becomes the script's exit status.
#######################################
main() {
  local share

  # Missing tools are recorded as warnings; the individual checks cope.
  need_cmd findmnt || true
  need_cmd df || true
  need_cmd awk || true

  check_fstype "/mnt/cache" "xfs" "critical" "cache_fstype"
  check_disk1_fstype
  check_no_ntfs_on_core_mounts
  check_mover_drift
  check_inode_usage "/mnt/cache" 80 "cache_inode_usage"
  check_disk1_inode_usage
  check_filesystem_usage "/mnt/cache" 70 "cache_fill_level" "warning"

  for share in appdata system domains; do
    if [ -e "/mnt/user/$share" ]; then
      check_filesystem_usage "/mnt/user/$share" 70 "share_${share}_fill_level" "warning"
    else
      add_result "warning" "share_${share}_fill_level" "/mnt/user/$share missing"
    fi
  done

  check_nvme_smart
  check_authelia_config_drift
  write_json
}

main "$@"