ops: prepare docker critical events watcher

This commit is contained in:
2026-06-05 22:25:23 +02:00
parent bc3ecad45a
commit 2f3d184a3b
6 changed files with 225 additions and 9 deletions
+142
View File
@@ -0,0 +1,142 @@
#!/usr/bin/env bash
#
# Supervisor for the Docker critical-events watcher (start/stop/restart/
# status/smoke).
#
# Configuration: every setting below may be supplied through an environment
# variable of the same name; the value after ":=" is used when the variable is
# unset or empty.
set -euo pipefail

# Directory holding the pidfile, captured watcher output and the event log.
: "${BASE_DIR:=/mnt/user/services/posture-check}"
# Watcher script that is launched in the background.
: "${WATCHER_SCRIPT:=/mnt/user/services/homelab-infra/services/posture-check/docker-critical-events.sh}"
# Runtime files; by default they live under BASE_DIR.
: "${PID_FILE:=$BASE_DIR/docker-critical-events.pid}"
: "${OUT_FILE:=$BASE_DIR/docker-critical-events.out}"
: "${EVENT_LOG:=$BASE_DIR/docker-critical-events-last.log}"
# ntfy helper script and the topic notifications are sent to.
: "${NTFY_SCRIPT:=/mnt/user/services/homelab-infra/ops/restore-tests/send-ntfy.sh}"
: "${NTFY_TOPIC:=homelab-alerts}"
#######################################
# Print the command-line help text.
# Outputs: usage line plus one line per sub-command, written to stderr.
#######################################
usage() {
  printf '%s\n' \
    "Usage: $0 start|stop|restart|status|smoke" \
    "start Start Docker critical-events watcher in the background." \
    "stop Stop the watcher by pidfile." \
    "restart Stop and start the watcher." \
    "status Print watcher status and recent log tail." \
    "smoke Send one ntfy test message through the same alert path." \
    >&2
}
#######################################
# Report whether the watcher process recorded in the pidfile is alive.
# Globals:   PID_FILE (read)
# Returns:   0 when the pidfile holds a positive decimal PID that accepts
#            signal 0; 1 otherwise (missing, empty, unreadable or malformed
#            pidfile, or no such process).
#######################################
is_running() {
  [[ -s "$PID_FILE" ]] || return 1
  local pid
  pid="$(<"$PID_FILE")" || return 1
  # Accept only a plain positive integer. Values such as "0", "-1" or other
  # junk would make kill address a process group or every process rather than
  # one watcher, so they are treated as "not running".
  [[ "$pid" =~ ^[1-9][0-9]*$ ]] || return 1
  # Signal 0 performs the existence/permission check without delivering
  # anything to the process.
  kill -0 "$pid" >/dev/null 2>&1
}
#######################################
# Launch the watcher script in the background and record its PID.
# Globals:   BASE_DIR, WATCHER_SCRIPT, PID_FILE, OUT_FILE, EVENT_LOG,
#            NTFY_SCRIPT, NTFY_TOPIC (all read)
# Outputs:   status line on stdout; error messages on stderr.
# Returns:   0 if the watcher is already running or was started and is still
#            alive after one second; 1 if the script is unreadable or the
#            process did not stay up.
#######################################
start_watcher() {
  mkdir -p "$BASE_DIR"
  if is_running; then
    echo "docker-critical-events watcher already running (pid $(cat "$PID_FILE"))"
    return 0
  fi
  if [ ! -r "$WATCHER_SCRIPT" ]; then
    echo "Watcher script not readable: $WATCHER_SCRIPT" >&2
    return 1
  fi

  # The watcher receives the ntfy settings and the event-log path (as
  # OUTPUT_PATH) through its environment; stdout/stderr go to OUT_FILE, which
  # is truncated on every start.
  NTFY_SCRIPT="$NTFY_SCRIPT" \
  NTFY_TOPIC="$NTFY_TOPIC" \
  OUTPUT_PATH="$EVENT_LOG" \
    nohup bash "$WATCHER_SCRIPT" >"$OUT_FILE" 2>&1 </dev/null &
  local pid=$!
  echo "$pid" > "$PID_FILE"

  # Give the watcher a moment so that an immediate crash is noticed.
  sleep 1
  if is_running; then
    echo "docker-critical-events watcher started (pid $pid)"
  else
    # Do not leave a pidfile pointing at a dead process: once that PID is
    # reused, "start" would report "already running" and "stop" would signal
    # an unrelated process.
    rm -f "$PID_FILE"
    echo "docker-critical-events watcher failed to stay running; see $OUT_FILE" >&2
    return 1
  fi
}
#######################################
# Stop the watcher identified by the pidfile.
# Sends SIGTERM, waits one second, and escalates to SIGKILL if the process
# still answers signal 0. The pidfile is removed in every case.
# Globals:   PID_FILE (read, deleted)
# Outputs:   progress messages on stdout.
# Returns:   0 (also when nothing was running).
#######################################
stop_watcher() {
  local target_pid

  # Nothing alive: just clear a possibly stale pidfile.
  is_running || {
    rm -f "$PID_FILE"
    echo "docker-critical-events watcher is not running"
    return 0
  }

  target_pid="$(cat "$PID_FILE")"

  # NOTE(review): only the recorded PID is signalled; whether child processes
  # of the watcher exit with it cannot be told from here - confirm.
  # Best effort: the process may have exited between the check and the signal.
  kill "$target_pid" &>/dev/null || :
  sleep 1

  if kill -0 "$target_pid" &>/dev/null; then
    echo "watcher still running after SIGTERM; sending SIGKILL"
    kill -9 "$target_pid" &>/dev/null || :
  fi

  rm -f "$PID_FILE"
  echo "docker-critical-events watcher stopped"
}
#######################################
# Print the watcher state, the configured paths and the last 20 lines of the
# event log and of the captured watcher output (each only when non-empty).
# Globals:   PID_FILE, WATCHER_SCRIPT, EVENT_LOG, OUT_FILE (read)
# Outputs:   key=value lines followed by optional log tails, on stdout.
#######################################
status_watcher() {
  if ! is_running; then
    echo "status=stopped"
    # A pidfile without a live process is reported so it can be cleaned up.
    if [ -e "$PID_FILE" ]; then
      echo "stale_pidfile=$PID_FILE"
    fi
  else
    echo "status=running pid=$(cat "$PID_FILE")"
  fi

  printf '%s\n' \
    "watcher_script=$WATCHER_SCRIPT" \
    "event_log=$EVENT_LOG" \
    "out_file=$OUT_FILE"

  if [[ -s "$EVENT_LOG" ]]; then
    printf '\n%s\n' "Recent critical events:"
    tail -n 20 "$EVENT_LOG"
  fi

  if [[ -s "$OUT_FILE" ]]; then
    printf '\n%s\n' "Recent watcher output:"
    tail -n 20 "$OUT_FILE"
  fi
}
#######################################
# Send a single test notification through the ntfy helper script.
# Globals:   NTFY_SCRIPT, NTFY_TOPIC (read)
# Outputs:   confirmation on stdout; error on stderr if the helper is missing.
# Returns:   1 when the helper script is not readable.
#######################################
smoke_ntfy() {
  [[ -r "$NTFY_SCRIPT" ]] || {
    echo "ntfy helper not readable: $NTFY_SCRIPT" >&2
    return 1
  }

  # Helper arguments in call order: topic, title, message text, and a literal
  # "default" (presumably a priority - confirm against the helper script).
  local -a ntfy_args=(
    "$NTFY_TOPIC"
    "Docker critical watcher smoke"
    "Smoke test from $(hostname) at $(date -Iseconds). No container was stopped."
    default
  )

  bash "$NTFY_SCRIPT" "${ntfy_args[@]}"
  echo "smoke notification sent to $NTFY_TOPIC"
}
# Command dispatch: the first CLI argument selects the action. Anything else,
# including a missing argument, prints the usage text and exits with status 2.
action="${1:-}"
case "$action" in
  start) start_watcher ;;
  stop) stop_watcher ;;
  restart)
    stop_watcher
    start_watcher
    ;;
  status) status_watcher ;;
  smoke) smoke_ntfy ;;
  *)
    usage
    exit 2
    ;;
esac
@@ -0,0 +1,55 @@
#!/usr/bin/env bash
#
# Filter test for docker-critical-events.sh: runs the watcher against a stub
# "docker" executable that replays three canned events, with ntfy sending
# switched off (SEND_NTFY=0) and the event log redirected into a temp dir.
set -euo pipefail

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
WATCHER="$SCRIPT_DIR/../docker-critical-events.sh"
[[ -r "$WATCHER" ]] || {
  echo "FAIL: watcher not readable at $WATCHER" >&2
  exit 1
}

# Scratch directory, removed again on every exit path.
tmp="$(mktemp -d)"
cleanup() { rm -rf "$tmp"; }
trap cleanup EXIT

# Stub docker: accepts only the "events" sub-command and prints one clean
# exit (exitCode 0), one failing exit (exitCode 137) and one oom event.
stub_docker="$tmp/bin/docker"
mkdir -p "$tmp/bin"
cat > "$stub_docker" <<'EOF'
#!/usr/bin/env bash
if [ "${1:-}" != "events" ]; then
echo "unexpected docker command: $*" >&2
exit 1
fi
cat <<'EVENTS'
{"Type":"container","Action":"die","Actor":{"Attributes":{"name":"ok-container","image":"example:latest","exitCode":"0"}}}
{"Type":"container","Action":"die","Actor":{"Attributes":{"name":"bad-container","image":"example:latest","exitCode":"137"}}}
{"Type":"container","Action":"oom","Actor":{"Attributes":{"name":"oom-container","image":"example:latest"}}}
EVENTS
EOF
chmod +x "$stub_docker"

# Run the watcher with the stub first on PATH so it is picked up instead of
# a real docker binary.
OUTPUT_PATH="$tmp/events.log" \
SEND_NTFY=0 \
PATH="$tmp/bin:$PATH" \
  bash "$WATCHER"
#######################################
# Abort the test: print the failure reason and a dump of the event log.
# Globals:   tmp (read) - scratch directory containing events.log
# Arguments: message words, joined into one line
# Outputs:   everything goes to stderr
# Returns:   never; exits the script with status 1
#######################################
fail() {
  {
    printf 'FAIL: %s\n' "$*"
    printf '%s\n' "--- events.log ---"
    # The log may not exist at all; the dump is best effort.
    cat "$tmp/events.log" || true
  } >&2
  exit 1
}
# Assertions on the log the watcher produced: only the failing exit and the
# oom event may be recorded, one line each.
events_log="$tmp/events.log"

if [[ ! -s "$events_log" ]]; then
  fail "expected critical event log to be written"
fi

# A clean exit (exitCode 0) must be filtered out.
! grep -q 'ok-container' "$events_log" || fail "exitCode 0 die event should not alert"

if ! grep -q 'bad-container' "$events_log"; then
  fail "non-zero die event missing"
fi
if ! grep -q 'oom-container' "$events_log"; then
  fail "oom event missing"
fi

# wc may pad its output with blanks; strip them before comparing.
logged_lines="$(wc -l < "$events_log" | tr -d ' ')"
if [[ "$logged_lines" != "2" ]]; then
  fail "expected 2 logged critical events, got $logged_lines"
fi

echo "OK - docker critical events filter test passed"
+22 -5
View File
@@ -93,12 +93,29 @@ bash /mnt/user/services/homelab-infra/services/posture-check/daily-status-report
## `docker-critical-events-at-start`
Zeit: Array Start. Dieser Job startet einen Hintergrund-Watcher und beendet sich sofort.
Zeit: Array Start. Dieser Job startet einen Hintergrund-Watcher und beendet sich
sofort. Der Supervisor schreibt PID, stdout/stderr und Event-Log nach
`/mnt/user/services/posture-check/`.
```bash
#!/bin/bash
ps -ef | grep -F -- "docker events --filter event=die --filter event=oom --filter event=kill" | grep -v grep >/dev/null && exit 0
mkdir -p /mnt/user/services/posture-check
nohup bash /mnt/user/services/homelab-infra/services/posture-check/docker-critical-events.sh >/mnt/user/services/posture-check/docker-critical-events.out 2>&1 </dev/null &
exit 0
exec /mnt/user/services/homelab-infra/services/posture-check/docker-critical-events-supervisor.sh start
```
Status pruefen:
```bash
/mnt/user/services/homelab-infra/services/posture-check/docker-critical-events-supervisor.sh status
```
Stoppen:
```bash
/mnt/user/services/homelab-infra/services/posture-check/docker-critical-events-supervisor.sh stop
```
ntfy-Smoke-Test ohne Container-Stopp:
```bash
/mnt/user/services/homelab-infra/services/posture-check/docker-critical-events-supervisor.sh smoke
```