ops: prepare docker critical events watcher

This commit is contained in:
2026-06-05 22:25:23 +02:00
parent bc3ecad45a
commit 2f3d184a3b
6 changed files with 225 additions and 9 deletions
+142
View File
@@ -0,0 +1,142 @@
#!/usr/bin/env bash
#
# Control script for the Docker critical-events watcher.
# Commands: start | stop | restart | status | smoke.
#
# Every setting below may be overridden from the environment; a value that is
# unset or empty falls back to the default shown here.
set -euo pipefail

# Directory holding the pidfile and log files.
: "${BASE_DIR:=/mnt/user/services/posture-check}"
# Script that is launched in the background by "start".
: "${WATCHER_SCRIPT:=/mnt/user/services/homelab-infra/services/posture-check/docker-critical-events.sh}"
# Pidfile, captured stdout/stderr of the watcher, and the event log path that
# is handed to the watcher as OUTPUT_PATH.
: "${PID_FILE:=$BASE_DIR/docker-critical-events.pid}"
: "${OUT_FILE:=$BASE_DIR/docker-critical-events.out}"
: "${EVENT_LOG:=$BASE_DIR/docker-critical-events-last.log}"
# Notification helper and the topic it publishes to.
: "${NTFY_SCRIPT:=/mnt/user/services/homelab-infra/ops/restore-tests/send-ntfy.sh}"
: "${NTFY_TOPIC:=homelab-alerts}"
#######################################
# Print the command summary.
# Outputs: usage text on stderr; nothing on stdout.
#######################################
usage() {
  {
    printf 'Usage: %s start|stop|restart|status|smoke\n' "$0"
    printf '%s\n' \
      'start Start Docker critical-events watcher in the background.' \
      'stop Stop the watcher by pidfile.' \
      'restart Stop and start the watcher.' \
      'status Print watcher status and recent log tail.' \
      'smoke Send one ntfy test message through the same alert path.'
  } >&2
}
#######################################
# Report whether the process recorded in the pidfile is alive.
# Globals: PID_FILE (read)
# Returns: 0 when PID_FILE holds a positive integer PID that accepts
#          signal 0; 1 when the file is missing, empty, unreadable, holds
#          anything other than a positive integer, or the process is gone.
#######################################
is_running() {
  [ -s "$PID_FILE" ] || return 1
  local pid
  pid="$(cat "$PID_FILE")" || return 1
  # Accept only a plain positive integer. A value of 0 would make kill target
  # the caller's own process group, so a corrupt pidfile would be reported as
  # "running" and a later stop would signal the wrong processes.
  [[ "$pid" =~ ^[1-9][0-9]*$ ]] || return 1
  # Signal 0 performs the existence/permission check without delivering
  # anything to the process.
  kill -0 "$pid" >/dev/null 2>&1
}
#######################################
# Launch the watcher script in the background and record its PID.
# Globals: BASE_DIR, WATCHER_SCRIPT, OUT_FILE, EVENT_LOG, NTFY_SCRIPT,
#          NTFY_TOPIC (read); PID_FILE (read, written, removed on failure)
# Outputs: status line on stdout; failure diagnostics on stderr.
# Returns: 0 if the watcher is already running or was started and is still
#          alive one second later; 1 if the script is unreadable or the
#          watcher did not stay up.
#######################################
start_watcher() {
  mkdir -p "$BASE_DIR"

  if is_running; then
    echo "docker-critical-events watcher already running (pid $(cat "$PID_FILE"))"
    return 0
  fi

  if [ ! -r "$WATCHER_SCRIPT" ]; then
    echo "Watcher script not readable: $WATCHER_SCRIPT" >&2
    return 1
  fi

  # The watcher receives its settings through the environment; nohup plus the
  # stdin/stdout/stderr redirections detach it from the calling terminal.
  local pid
  NTFY_SCRIPT="$NTFY_SCRIPT" \
    NTFY_TOPIC="$NTFY_TOPIC" \
    OUTPUT_PATH="$EVENT_LOG" \
    nohup bash "$WATCHER_SCRIPT" >"$OUT_FILE" 2>&1 </dev/null &
  pid=$!
  echo "$pid" >"$PID_FILE"

  # Give the watcher a moment so an immediate crash is detected here.
  sleep 1
  if is_running; then
    echo "docker-critical-events watcher started (pid $pid)"
    return 0
  fi

  # Do not leave a pidfile pointing at a dead process: if that PID were reused
  # later, status/stop would act on an unrelated process.
  rm -f "$PID_FILE"
  echo "docker-critical-events watcher failed to stay running; see $OUT_FILE" >&2
  return 1
}
#######################################
# Stop the watcher recorded in the pidfile.
# Sends SIGTERM, waits up to five seconds for the process to exit, then
# escalates to SIGKILL and verifies the result.
# Globals: PID_FILE (read, removed once the process is confirmed gone)
# Outputs: status lines on stdout; an error on stderr if the process survives.
# Returns: 0 if the watcher was not running or has been stopped; 1 if the
#          process is still alive after SIGKILL (the pidfile is kept so a
#          later stop/status still refers to it).
#######################################
stop_watcher() {
  if ! is_running; then
    rm -f "$PID_FILE"
    echo "docker-critical-events watcher is not running"
    return 0
  fi

  local pid waited=0
  local -r grace_seconds=5
  pid="$(cat "$PID_FILE")"

  # Best effort: the process may exit between the liveness check and the
  # signal, so a failing kill is not an error here.
  kill "$pid" >/dev/null 2>&1 || true

  # Poll instead of sleeping a fixed second so a watcher that needs a little
  # longer to shut down cleanly is not killed prematurely.
  while kill -0 "$pid" >/dev/null 2>&1 && [ "$waited" -lt "$grace_seconds" ]; do
    sleep 1
    waited=$((waited + 1))
  done

  if kill -0 "$pid" >/dev/null 2>&1; then
    echo "watcher still running after SIGTERM; sending SIGKILL"
    kill -9 "$pid" >/dev/null 2>&1 || true
    sleep 1
    if kill -0 "$pid" >/dev/null 2>&1; then
      # Never claim success, or drop the pidfile, while the process lives.
      echo "docker-critical-events watcher (pid $pid) still running after SIGKILL; keeping $PID_FILE" >&2
      return 1
    fi
  fi

  rm -f "$PID_FILE"
  echo "docker-critical-events watcher stopped"
}
#######################################
# Print the watcher state, the configured paths and the last 20 lines of the
# event log and of the watcher output (each only when non-empty).
# Globals: PID_FILE, WATCHER_SCRIPT, EVENT_LOG, OUT_FILE (read)
# Outputs: key=value lines and log tails on stdout.
#######################################
status_watcher() {
  if ! is_running; then
    printf '%s\n' "status=stopped"
    # A pidfile that exists while nothing is running is left over.
    if [ -e "$PID_FILE" ]; then
      printf 'stale_pidfile=%s\n' "$PID_FILE"
    fi
  else
    printf 'status=running pid=%s\n' "$(cat "$PID_FILE")"
  fi

  printf 'watcher_script=%s\n' "$WATCHER_SCRIPT"
  printf 'event_log=%s\n' "$EVENT_LOG"
  printf 'out_file=%s\n' "$OUT_FILE"

  # Both log sections share one layout: blank line, heading, tail.
  local -a headings=("Recent critical events:" "Recent watcher output:")
  local -a logs=("$EVENT_LOG" "$OUT_FILE")
  local idx
  for idx in 0 1; do
    if [ -s "${logs[idx]}" ]; then
      printf '\n%s\n' "${headings[idx]}"
      tail -n 20 "${logs[idx]}"
    fi
  done
}
#######################################
# Send a single test notification through the ntfy helper script.
# Globals: NTFY_SCRIPT, NTFY_TOPIC (read)
# Outputs: confirmation on stdout; error on stderr if the helper is unreadable.
# Returns: 1 if NTFY_SCRIPT is not readable; otherwise the status of the
#          final confirmation echo.
#######################################
smoke_ntfy() {
  [ -r "$NTFY_SCRIPT" ] || {
    echo "ntfy helper not readable: $NTFY_SCRIPT" >&2
    return 1
  }

  # Helper arguments in order: topic, title, message body, "default".
  local -a ntfy_args=(
    "$NTFY_TOPIC"
    "Docker critical watcher smoke"
    "Smoke test from $(hostname) at $(date -Iseconds). No container was stopped."
    default
  )
  bash "$NTFY_SCRIPT" "${ntfy_args[@]}"

  echo "smoke notification sent to $NTFY_TOPIC"
}
#######################################
# Entry point: run the sub-command named by the first argument.
# Arguments: $1 - start | stop | restart | status | smoke
# Returns:   exits with status 2 after printing usage when the command is
#            missing or unknown.
#######################################
main() {
  local action="${1:-}"
  case "$action" in
    start) start_watcher ;;
    stop) stop_watcher ;;
    restart)
      # Stop first, then start again.
      stop_watcher
      start_watcher
      ;;
    status) status_watcher ;;
    smoke) smoke_ntfy ;;
    *)
      usage
      exit 2
      ;;
  esac
}

main "$@"