From d1f9491b24412099fea350da4a078dc8eeccea04 Mon Sep 17 00:00:00 2001
From: Micha
Date: Wed, 3 Jun 2026 13:02:16 +0200
Subject: [PATCH] feat(restore): shared postgresql 18 cluster restore drill

Kompletter Restore-Drill fuer den Shared-PostgreSQL-18-Cluster: globals
(Rollen) + 5 per-DB Custom-Format-Dumps (paperless, mailarchiver,
authelia, nextcloud, mealie). Bekannter mailarchiver-Bootstrap-
Rollenkonflikt wird toleriert. Authelia/Nextcloud/Mealie-Dumps als
optional markiert. Tabellen-Count pro DB als fachlicher Sanity-Check.

Machbarkeit vorab verifiziert: alle Dumps auf Host vorhanden, pg_restore
im postgres:18.4-Image verfuegbar, Postgres auf shfs bewiesen durch
bestehende Tests.

Co-Authored-By: Claude Opus 4.7
---
Review notes (this section is ignored by git am):
- shared-pg-cluster-restore-test.sh is created with mode 100755, because
  run-restore-checks.sh exec()s it directly, which fails on a 100644 file.
- The readiness wait is bounded and probes over TCP; the fixed "sleep 3"
  is gone.
- A restore that leaves no tables in schema public now fails the drill.
- Logs moved from fixed /tmp names to $RESTORE_ROOT/logs.
- Whitespace was lost in the reviewed copy of this patch: indentation
  (including the context lines of the run-restore-checks.sh hunk, apply
  with --ignore-whitespace if needed) and the two heredocs marked
  NOTE(review) are reconstructed.

 ops/restore-tests/run-restore-checks.sh       |   8 +-
 .../shared-pg-cluster-compose.test.yml        |  19 ++
 .../shared-pg-cluster-restore-test.sh         | 280 ++++++++++++++++++
 3 files changed, 306 insertions(+), 1 deletion(-)
 create mode 100644 ops/restore-tests/shared-pg-cluster-compose.test.yml
 create mode 100755 ops/restore-tests/shared-pg-cluster-restore-test.sh

diff --git a/ops/restore-tests/run-restore-checks.sh b/ops/restore-tests/run-restore-checks.sh
index 9b00405..db0d702 100755
--- a/ops/restore-tests/run-restore-checks.sh
+++ b/ops/restore-tests/run-restore-checks.sh
@@ -58,8 +58,14 @@ case "$MODE" in
     fi
     exec "$SCRIPT_DIR/komodo-mongo-restore-test.sh"
     ;;
+  shared-pg-cluster)
+    if [ "$WHATIF" = "--what-if" ]; then
+      exec "$SCRIPT_DIR/shared-pg-cluster-restore-test.sh" --what-if
+    fi
+    exec "$SCRIPT_DIR/shared-pg-cluster-restore-test.sh"
+    ;;
   *)
-    echo "Usage: $0 {freshness|vaultwarden|gitea|paperless|immich|authelia|nextcloud|komodo-bootstrap|komodo-mongo-restore} [--what-if]" >&2
+    echo "Usage: $0 {freshness|vaultwarden|gitea|paperless|immich|authelia|nextcloud|komodo-bootstrap|komodo-mongo-restore|shared-pg-cluster} [--what-if]" >&2
     exit 1
     ;;
 esac
diff --git a/ops/restore-tests/shared-pg-cluster-compose.test.yml b/ops/restore-tests/shared-pg-cluster-compose.test.yml
new file mode 100644
index 0000000..0279563
--- /dev/null
+++ b/ops/restore-tests/shared-pg-cluster-compose.test.yml
@@ -0,0 +1,19 @@
+services:
+  restoretest-shared-pg:
+    image: postgres:18.4@sha256:8ff36f3c66371cba71d20ceedccfc3de9669a68737607888c4ef0af93abe8e39
+    container_name: restoretest-shared-pg
+    restart: "no"
+    environment:
+      TZ: Europe/Berlin
+      POSTGRES_USER: postgres
+      POSTGRES_PASSWORD: restoretest-shared-pg-superuser
+      PGDATA: /var/lib/postgresql/18/docker
+    volumes:
+      - /mnt/user/backups/restore-lab/shared-pg-cluster/data:/var/lib/postgresql
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U postgres"]
+      interval: 10s
+      timeout: 5s
+      retries: 10
+    security_opt:
+      - no-new-privileges:true
diff --git a/ops/restore-tests/shared-pg-cluster-restore-test.sh b/ops/restore-tests/shared-pg-cluster-restore-test.sh
new file mode 100755
--- /dev/null
+++ b/ops/restore-tests/shared-pg-cluster-restore-test.sh
@@ -0,0 +1,280 @@
+#!/bin/bash
+set -euo pipefail
+
+# Shared PostgreSQL 18 cluster restore drill
+#
+# Proves that the complete shared Postgres cluster can be restored from the
+# dump artifacts:
+#   1. Globals (roles) from pg_dumpall --globals-only
+#   2. Per-DB custom-format dumps: paperless, mailarchiver, authelia,
+#      nextcloud, mealie
+#
+# Known special case (docs/RESTORE_MATRIX.md):
+#   - CREATE ROLE mailarchiver fails because that user is also the dump
+#     admin user. The ALTER ROLE after it must still go through, so the
+#     globals import tolerates plain errors and only aborts on FATAL/PANIC.
+#
+# Production PostgreSQL containers and data paths are NOT touched.
+#
+# Usage: shared-pg-cluster-restore-test.sh [--what-if] [--keep-data]
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+. "$SCRIPT_DIR/common.sh"
+
+WHATIF=0
+KEEP_DATA=0
+for arg in "$@"; do
+  case "$arg" in
+    --what-if) WHATIF=1 ;;
+    --keep-data) KEEP_DATA=1 ;;
+    *) echo "Unknown argument: $arg" >&2; exit 1 ;;
+  esac
+done
+
+RESTORE_ROOT="/mnt/user/backups/restore-lab/shared-pg-cluster"
+REPORT_ROOT="/mnt/user/backups/restore-reports"
+COMPOSE_FILE="$SCRIPT_DIR/shared-pg-cluster-compose.test.yml"
+REPORT_FILE="$REPORT_ROOT/shared-pg-cluster-$(date +%F).md"
+DUMP_ROOT="/mnt/user/backups/borg/dumps/latest"
+PG_CONTAINER="restoretest-shared-pg"
+PG_SUPERUSER_PASSWORD="restoretest-shared-pg-superuser"
+# Logs live under the restore root rather than under fixed names in /tmp,
+# so they are removed or preserved together with the test data.
+LOG_DIR="$RESTORE_ROOT/logs"
+# Readiness probes run 2s apart, so 60 attempts bound the wait to ~120s.
+READY_ATTEMPTS=60
+
+# All expected dumps
+GLOBALS_DUMP="$DUMP_ROOT/postgresql17-globals.sql"
+PAPERLESS_DUMP="$DUMP_ROOT/postgresql17-paperless.dump"
+MAILARCHIVER_DUMP="$DUMP_ROOT/postgresql17-mailarchiver.dump"
+AUTHELIA_DUMP="$DUMP_ROOT/postgresql17-authelia.dump"
+NEXTCLOUD_DUMP="$DUMP_ROOT/nextcloud.dump"
+MEALIE_DUMP="$DUMP_ROOT/mealie.dump"
+
+if [ "$WHATIF" -eq 1 ]; then
+  # NOTE(review): the heading and steps 1-2 of this text were truncated in
+  # the reviewed copy of this patch and are reconstructed - confirm wording.
+  cat <<EOF
+[what-if] Shared PostgreSQL 18 Cluster Restore Drill
+1. Test-Postgres starten ($COMPOSE_FILE)
+2. Globals importieren -> bekannter mailarchiver-Rollenkonflikt wird toleriert
+3. DBs anlegen: paperless, mailarchiver, authelia, nextcloud, mealie
+4. Per-DB pg_restore fuer jede DB
+5. Tabellen-Count pro DB als Sanity-Check
+6. Report schreiben
+EOF
+  exit 0
+fi
+
+require_cmd docker
+require_path "$COMPOSE_FILE"
+require_path "$GLOBALS_DUMP"
+require_path "$PAPERLESS_DUMP"
+require_path "$MAILARCHIVER_DUMP"
+# The authelia/nextcloud/mealie dumps are optional; restore_db skips them
+# when the file is missing.
+
+# Runs the given command inside the test container as the throwaway superuser.
+pg_exec() {
+  docker exec -i -e PGPASSWORD="$PG_SUPERUSER_PASSWORD" "$PG_CONTAINER" "$@"
+}
+
+RESTORE_SUCCESS=0
+# EXIT trap: always tears the compose stack down. Unless the drill succeeded
+# the restore root is handed to preserve_on_failure; after a success it is
+# deleted unless --keep-data was given.
+cleanup() {
+  docker compose -f "$COMPOSE_FILE" down -v >/dev/null 2>&1 || true
+  if [ "$RESTORE_SUCCESS" -ne 1 ]; then
+    preserve_on_failure "shared-pg-cluster" "$RESTORE_ROOT"
+    return
+  fi
+  if [ "$KEEP_DATA" -ne 1 ]; then
+    rm -rf "${RESTORE_ROOT:?}"
+  fi
+}
+trap cleanup EXIT
+
+rm -rf "${RESTORE_ROOT:?}"
+mkdir -p "$RESTORE_ROOT/data" "$LOG_DIR"
+
+# Stage 1: start the test Postgres
+docker compose -f "$COMPOSE_FILE" up -d restoretest-shared-pg >/dev/null
+# Probe over TCP: during first-time init the image entrypoint runs a temporary
+# server on the Unix socket only, which a socket probe reports as ready just
+# before it is shut down again. The wait is bounded so that a container which
+# never comes up fails the drill instead of hanging it.
+pg_ready=0
+for _ in $(seq 1 "$READY_ATTEMPTS"); do
+  if docker exec "$PG_CONTAINER" pg_isready -h 127.0.0.1 -U postgres >/dev/null 2>&1; then
+    pg_ready=1
+    break
+  fi
+  sleep 2
+done
+if [ "$pg_ready" -ne 1 ]; then
+  echo "Test-Postgres wurde nicht innerhalb von $((READY_ATTEMPTS * 2))s bereit" >&2
+  exit 1
+fi
+
+# Stage 2: import globals
+# The globals dump contains CREATE ROLE for all DB users. psql keeps going
+# after an ERROR (such as the known mailarchiver role conflict), so only
+# FATAL/PANIC in its output is treated as a failed import.
+globals_log="$LOG_DIR/globals.log"
+pg_exec psql -U postgres -f - < "$GLOBALS_DUMP" >"$globals_log" 2>&1 || true
+if grep -qiE "FATAL|PANIC" "$globals_log"; then
+  cat "$globals_log" >&2
+  exit 1
+fi
+globals_status="ok"
+
+# Stage 3: create the databases and restore the dumps
+declare -A DB_STATUS
+declare -A TABLE_COUNTS
+
+# Restores one database from a custom-format dump and records the outcome.
+# Globals:   DB_STATUS, TABLE_COUNTS (written); LOG_DIR (read)
+# Arguments: $1 database name
+#            $2 role that owns the database
+#            $3 path of the dump file
+#            $4 "yes" to skip instead of fail when the dump is missing
+# Returns:   0 on success or skip, 1 on failure
+restore_db() {
+  local dbname="$1"
+  local dbuser="$2"
+  local dump_path="$3"
+  local optional="${4:-no}"
+  local err_file="$LOG_DIR/restore-$dbname.err"
+  local restore_ok=0
+  local table_count
+
+  if [ ! -f "$dump_path" ]; then
+    TABLE_COUNTS[$dbname]="n/a"
+    if [ "$optional" = "yes" ]; then
+      DB_STATUS[$dbname]="skipped (dump missing)"
+      return 0
+    fi
+    DB_STATUS[$dbname]="failed (dump missing)"
+    return 1
+  fi
+
+  # Create the role in case the globals import did not (idempotent)
+  pg_exec psql -U postgres -c "DO \$\$ BEGIN CREATE ROLE $dbuser WITH LOGIN PASSWORD 'restoretest-$dbuser'; EXCEPTION WHEN duplicate_object THEN NULL; END \$\$;" >/dev/null 2>&1 || true
+
+  # Create the database unless it already exists
+  if ! pg_exec psql -U postgres -tAc "SELECT 1 FROM pg_database WHERE datname='$dbname'" 2>/dev/null | grep -qx 1; then
+    pg_exec createdb -U postgres -O "$dbuser" "$dbname" 2>/dev/null || true
+  fi
+
+  # pg_restore with retry
+  for _ in 1 2 3 4 5; do
+    if pg_exec pg_restore -U postgres -d "$dbname" --clean --if-exists --no-owner --no-privileges \
+      < "$dump_path" 2>"$err_file"; then
+      restore_ok=1
+      break
+    fi
+    # The server is not accepting connections (yet): wait and try again
+    if grep -qiE "starting up|shutting down|connection refused" "$err_file"; then
+      sleep 5
+      continue
+    fi
+    if grep -qiE "FATAL|PANIC" "$err_file"; then
+      DB_STATUS[$dbname]="failed"
+      TABLE_COUNTS[$dbname]="n/a"
+      cat "$err_file" >&2
+      return 1
+    fi
+    # Other pg_restore errors (e.g. "does not exist" from --clean) are
+    # tolerated here; the table count below decides whether data arrived.
+    restore_ok=1
+    break
+  done
+
+  if [ "$restore_ok" -ne 1 ]; then
+    DB_STATUS[$dbname]="failed (timeout)"
+    TABLE_COUNTS[$dbname]="n/a"
+    return 1
+  fi
+
+  # Count tables. Any pg_restore failure whose message lacks FATAL/PANIC is
+  # tolerated above, so a database without any public tables (or a count
+  # that cannot be read) is treated as a failed restore.
+  table_count="$(pg_exec psql -U postgres -d "$dbname" -tAc \
+    "SELECT count(*) FROM information_schema.tables WHERE table_schema='public';" \
+    2>/dev/null | tr -d '[:space:]' || true)"
+  case "$table_count" in
+    '' | 0 | *[!0-9]*)
+      DB_STATUS[$dbname]="failed (no tables restored)"
+      TABLE_COUNTS[$dbname]="${table_count:-n/a}"
+      cat "$err_file" >&2
+      return 1
+      ;;
+  esac
+
+  DB_STATUS[$dbname]="ok"
+  TABLE_COUNTS[$dbname]="$table_count"
+}
+
+restore_db "paperless" "paperless" "$PAPERLESS_DUMP"
+restore_db "mailarchiver" "mailarchiver" "$MAILARCHIVER_DUMP"
+restore_db "authelia" "authelia" "$AUTHELIA_DUMP" "yes"
+restore_db "nextcloud" "nextcloud" "$NEXTCLOUD_DUMP" "yes"
+restore_db "mealie" "mealie" "$MEALIE_DUMP" "yes"
+
+# Stage 4: check data_checksums
+checksums="$(pg_exec psql -U postgres -tAc "SHOW data_checksums;" \
+  2>/dev/null | tr -d '[:space:]' || echo "n/a")"
+
+# Stage 5: database list
+db_list="$(pg_exec psql -U postgres -tAc \
+  "SELECT datname FROM pg_database WHERE NOT datistemplate ORDER BY datname;" \
+  2>/dev/null | tr '\n' ',' | sed 's/,$//' || echo "n/a")"
+
+# Build the report
+report_body="# Shared PostgreSQL 18 Cluster Restore Drill - $(date +%F)
+
+- Dump source: \`$DUMP_ROOT\`
+- Restore root: \`$RESTORE_ROOT\`
+- Result: \`SUCCESS\`
+
+## Checks
+
+- Test-Postgres healthy: \`ok\`
+- Globals import: \`$globals_status\`
+- data_checksums: \`$checksums\`
+- Databases: \`$db_list\`
+
+## Per-DB Restore
+
+| Database | Restore | Tables |
+|---|---|---|
+| paperless | \`${DB_STATUS[paperless]}\` | \`${TABLE_COUNTS[paperless]}\` |
+| mailarchiver | \`${DB_STATUS[mailarchiver]}\` | \`${TABLE_COUNTS[mailarchiver]}\` |
+| authelia | \`${DB_STATUS[authelia]}\` | \`${TABLE_COUNTS[authelia]}\` |
+| nextcloud | \`${DB_STATUS[nextcloud]}\` | \`${TABLE_COUNTS[nextcloud]}\` |
+| mealie | \`${DB_STATUS[mealie]}\` | \`${TABLE_COUNTS[mealie]}\` |
+
+## Scope
+
+Dieser Drill beweist, dass der gesamte Shared-PostgreSQL-18-Cluster aus
+den taeglichen Dump-Artefakten wiederhergestellt werden kann: Globals
+(Rollen) + per-DB Custom-Format-Dumps. Der bekannte mailarchiver-
+Bootstrap-Rollenkonflikt wird toleriert.
+
+## Notes
+
+- Produktive PostgreSQL-Container und -Datenpfade wurden nicht beruehrt.
+- Test-Postgres nutzt Wegwerf-Superuser-Passwort.
+- Test-Daten wurden \`$([ "$KEEP_DATA" -eq 1 ] && echo behalten || echo bereinigt)\`.
+"
+
+# NOTE(review): everything from here on was truncated in the reviewed copy of
+# this patch and is reconstructed - confirm against the author's version.
+write_report "$REPORT_FILE" <<EOF
+$report_body
+EOF
+
+RESTORE_SUCCESS=1
+echo "Report geschrieben -> $REPORT_FILE"