diff --git a/ops/restore-tests/README.md b/ops/restore-tests/README.md index 0b9bf8f..e72ce9d 100644 --- a/ops/restore-tests/README.md +++ b/ops/restore-tests/README.md @@ -42,6 +42,9 @@ Ziel: - `authelia-compose.test.yml`: isolierte Testinstanz fuer Authelia inkl. Test-Postgres, Filesystem-Notifier (kein echter SMTP-Versand) - `authelia-plan.md`: konkreter Authelia-Testplan - `authelia-runbook.md`: Operator-Runbook fuer den ersten Authelia-Lauf +- `nextcloud-restore-test.sh`: Nextcloud-Restore-Job (Scaffold; Erstlauf noch offen) +- `nextcloud-compose.test.yml`: isolierte Testinstanz fuer Nextcloud inkl. Test-Postgres und Test-Redis + - `check-restore-freshness.ps1`: woechentlicher Frische-Check fuer Dumps und Reports - `run-restore-checks.ps1`: einfacher Dispatcher fuer Restore-Jobs - `check-restore-freshness.sh`: hosttauglicher Frische-Check diff --git a/ops/restore-tests/nextcloud-compose.test.yml b/ops/restore-tests/nextcloud-compose.test.yml new file mode 100644 index 0000000..c75f204 --- /dev/null +++ b/ops/restore-tests/nextcloud-compose.test.yml @@ -0,0 +1,61 @@ +services: + restoretest-nextcloud-postgres: + # Gleiche Major-Version wie apps/nextcloud/docker-compose.yml in Produktion. 
+ image: postgres:18.4@sha256:8ff36f3c66371cba71d20ceedccfc3de9669a68737607888c4ef0af93abe8e39 + container_name: restoretest-nextcloud-postgres + restart: "no" + environment: + TZ: Europe/Berlin + POSTGRES_DB: nextcloud + POSTGRES_USER: nextcloud + POSTGRES_PASSWORD: restoretest-nextcloud-db + PGDATA: /var/lib/postgresql/18/docker + volumes: + - /mnt/user/backups/restore-lab/nextcloud/postgres:/var/lib/postgresql + healthcheck: + test: ["CMD-SHELL", "pg_isready -U nextcloud -d nextcloud"] + interval: 10s + timeout: 5s + retries: 10 + security_opt: + - no-new-privileges:true + + restoretest-nextcloud-redis: + image: redis:8.8.0-alpine@sha256:09160599abd229764c0fb44cb6be640294e1d360a54b19985ab4843dcf2d90f1 + container_name: restoretest-nextcloud-redis + restart: "no" + command: redis-server --save "" --appendonly no + security_opt: + - no-new-privileges:true + + restoretest-nextcloud: + # Gleicher Image-Digest wie apps/nextcloud/docker-compose.yml. + image: nextcloud:33.0.4-apache@sha256:caa40b8beaf0057ac213d8dfc515c36ce64f7a8f0825b6a287e6f7cf2f4a095d + container_name: restoretest-nextcloud + restart: "no" + depends_on: + restoretest-nextcloud-postgres: + condition: service_healthy + restoretest-nextcloud-redis: + condition: service_started + environment: + TZ: Europe/Berlin + POSTGRES_HOST: restoretest-nextcloud-postgres + POSTGRES_DB: nextcloud + POSTGRES_USER: nextcloud + POSTGRES_PASSWORD: restoretest-nextcloud-db + REDIS_HOST: restoretest-nextcloud-redis + NEXTCLOUD_ADMIN_USER: restoretest-admin + NEXTCLOUD_ADMIN_PASSWORD: restoretest-nextcloud-admin-pass + NEXTCLOUD_DATA_DIR: /var/www/html/data + # Bewusst keine Trusted-Domain/Proxy-Konfiguration: Smoke prueft + # nur localhost-HTTP, keine Traefik-Route. + ports: + # nur 127.0.0.1, keine Public-Route, keine Traefik-Labels + - "127.0.0.1:18180:80" + volumes: + # Restore-Lab-Pfade: alles isoliert, keine produktiven Mounts. 
+ - /mnt/user/backups/restore-lab/nextcloud/html:/var/www/html + - /mnt/user/backups/restore-lab/nextcloud/data:/var/www/html/data + security_opt: + - no-new-privileges:true diff --git a/ops/restore-tests/nextcloud-restore-test.sh b/ops/restore-tests/nextcloud-restore-test.sh new file mode 100644 index 0000000..a6eca34 --- /dev/null +++ b/ops/restore-tests/nextcloud-restore-test.sh @@ -0,0 +1,278 @@ +#!/bin/bash +set -euo pipefail + +# Nextcloud Restore Smoke Test +# +# Non-destructive restore smoke test for Nextcloud. +# +# What this smoke test demonstrates: +# - the Nextcloud html path (app code + config) and nextcloud.dump can be extracted from the Borg archive +# - nextcloud.dump can be imported into an isolated test Postgres +# - Nextcloud starts against the restored app data + test Redis and answers +# over HTTP +# - occ status is queried for the maintenance flag (expected: false) +# +# Differences from the other restore tests: +# - Nextcloud has its own Postgres (not shared) with its own DB role +# - Nextcloud uses its own Redis instance (snapshot persistence, no password) +# - occ maintenance:mode and the oc_admin role matter in a DR scenario; +# the smoke test checks occ status after boot +# - production secrets (admin_user, admin_password, postgres_password) are +# replaced by throwaway values in the test compose file +# +# The production Nextcloud containers, production Postgres DB, production secrets, +# production user data under /mnt/user/documents/nextcloud-data and the +# production Traefik entry are NOT touched. + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +. 
"$SCRIPT_DIR/common.sh" + +WHATIF=0 +KEEP_DATA=0 +for arg in "$@"; do + case "$arg" in + --what-if) WHATIF=1 ;; + --keep-data) KEEP_DATA=1 ;; + *) echo "Unknown argument: $arg" >&2; exit 1 ;; + esac +done + +RESTORE_ROOT="/mnt/user/backups/restore-lab/nextcloud" +REPORT_ROOT="/mnt/user/backups/restore-reports" +EXTRACT_DIR="$BORG_RESTORE_HOST_ROOT/nextcloud-extract" +COMPOSE_FILE="$SCRIPT_DIR/nextcloud-compose.test.yml" +REPORT_FILE="$REPORT_ROOT/nextcloud-$(date +%F).md" + +if [ "$WHATIF" -eq 1 ]; then + cat < nextcloud.dump +- HTTP 200/302/3xx von 127.0.0.1:18180 +- occ status: maintenance=false +EOF + exit 0 +fi + +require_cmd docker +require_cmd curl +require_path "$BORG_PASSPHRASE_FILE_DEFAULT" +require_path "$COMPOSE_FILE" + +RESTORE_SUCCESS=0 +cleanup() { + cleanup_compose "$COMPOSE_FILE" + if [ "$RESTORE_SUCCESS" -ne 1 ]; then + preserve_on_failure "nextcloud" "$RESTORE_ROOT" + rm -rf "$EXTRACT_DIR" + return + fi + if [ "$KEEP_DATA" -ne 1 ]; then + rm -rf "$RESTORE_ROOT" + fi + rm -rf "$EXTRACT_DIR" +} +trap cleanup EXIT + +rm -rf "$EXTRACT_DIR" "$RESTORE_ROOT" +mkdir -p "$RESTORE_ROOT/html" "$RESTORE_ROOT/data" "$RESTORE_ROOT/postgres" "$RESTORE_ROOT/dumps/latest" + +archive="$(latest_archive_name)" +repo="$(borg_repo_url)" + +if [ -z "$archive" ] || [ -z "$repo" ]; then + echo "Could not resolve Borg repo/archive from borg-ui database" >&2 + exit 1 +fi + +# Stage 1: extract the Nextcloud app path and the dump from Borg. +# Only html (app code + config) and nextcloud.dump are requested here. +# NOTE(review): the previous comment described extracting a directory +# listing of the user data (local/documents/nextcloud-data) to prove the +# path exists in the archive; this call does not do that - confirm intent. +# The full user data is considered too large for a regular smoke run. +borg_extract "/restore/nextcloud-extract" \ + "local/appdata/nextcloud/html" \ + "local/borg-dumps/latest/nextcloud.dump" + +if [ ! 
-d "$EXTRACT_DIR/local/appdata/nextcloud/html" ]; then
+  echo "Nextcloud html path missing in Borg archive" >&2
+  exit 1
+fi
+if [ ! -f "$EXTRACT_DIR/local/borg-dumps/latest/nextcloud.dump" ]; then
+  echo "nextcloud.dump missing in Borg archive" >&2
+  exit 1
+fi
+
+# Move app code + config and the dump into the restore lab.
+cp -a "$EXTRACT_DIR/local/appdata/nextcloud/html/." "$RESTORE_ROOT/html/"
+mv "$EXTRACT_DIR/local/borg-dumps/latest/nextcloud.dump" "$RESTORE_ROOT/dumps/latest/nextcloud.dump"
+
+# Nextcloud needs a writable data path even when it is empty. In the restore
+# lab that is /mnt/user/backups/restore-lab/nextcloud/data.
+mkdir -p "$RESTORE_ROOT/data"
+chmod -R a+rwX "$RESTORE_ROOT/data"
+
+# Succeeds when the trusted_domains array of the config.php given as $1
+# mentions 127.0.0.1. Only the span from the 'trusted_domains' key to the
+# next line that closes an array with ")," is inspected, so a 127.0.0.1
+# elsewhere in the file does not count.
+trusted_domains_has_localhost() {
+  sed -n "/'trusted_domains'/,/^[[:space:]]*),/p" "$1" | grep -F "127.0.0.1" >/dev/null
+}
+
+# The restored config.php carries the production DB/Redis settings; patch
+# them to the throwaway values used by the test compose file.
+CONFIG_PHP="$RESTORE_ROOT/html/config/config.php"
+if [ -f "$CONFIG_PHP" ]; then
+  # Keep the original config for diagnosis.
+  cp "$CONFIG_PHP" "$RESTORE_ROOT/html/config/config.php.original"
+
+  # config.php is PHP; the relevant lines are patched with sed.
+  # Redis is handled in two shapes:
+  #  - a complete one-line "'redis' => array( ... )," is replaced as a whole
+  #  - a multi-line array (the var_export layout, key and "array (" on
+  #    separate lines) keeps its structure; only its 'host' and 'port'
+  #    lines between the key and the next ")" are rewritten. Replacing just
+  #    the opener line would leave orphaned entries behind.
+  sed -i \
+    -e "s|'dbhost'.*|'dbhost' => 'restoretest-nextcloud-postgres',|" \
+    -e "s|'dbuser'.*|'dbuser' => 'nextcloud',|" \
+    -e "s|'dbpassword'.*|'dbpassword' => 'restoretest-nextcloud-db',|" \
+    -e "s|'dbname'.*|'dbname' => 'nextcloud',|" \
+    -e "s|'dbport'.*|'dbport' => '',|" \
+    -e "s|'redis'[[:space:]]*=>[[:space:]]*array[[:space:]]*(.*).*|'redis' => array( 'host' => 'restoretest-nextcloud-redis', 'port' => 6379 ),|" \
+    -e "/'redis'[[:space:]]*=>/,/)/ s|^\([[:space:]]*\)'host'[[:space:]]*=>.*|\1'host' => 'restoretest-nextcloud-redis',|" \
+    -e "/'redis'[[:space:]]*=>/,/)/ s|^\([[:space:]]*\)'port'[[:space:]]*=>.*|\1'port' => 6379,|" \
+    "$CONFIG_PHP"
+
+  # trusted_domains: add 127.0.0.1 so the smoke endpoint is accepted;
+  # Nextcloud otherwise answers "Access through untrusted domain".
+  config_patched="ok"
+  if ! trusted_domains_has_localhost "$CONFIG_PHP"; then
+    # Nested arrays in a var_export-style config.php close with "),"; only
+    # the top-level array ends in ");". The entry is therefore inserted in
+    # front of the closing ")," of the trusted_domains array.
+    sed -i "/'trusted_domains'/,/^[[:space:]]*),/ s|^\([[:space:]]*\)),|\1  999 => '127.0.0.1',\n\1),|" "$CONFIG_PHP" || true
+    # Best effort, but never report a patch that did not happen.
+    if ! trusted_domains_has_localhost "$CONFIG_PHP"; then
+      echo "WARN: could not add 127.0.0.1 to trusted_domains in $CONFIG_PHP" >&2
+      config_patched="ok (trusted_domains not patched)"
+    fi
+  fi
+else
+  config_patched="no config.php found"
+fi
+
+# Stage 2: start the test Postgres and the test Redis.
+docker compose -f "$COMPOSE_FILE" up -d restoretest-nextcloud-postgres restoretest-nextcloud-redis >/dev/null
+# Bounded wait (60 x 2 s). The test containers run with restart "no", so a
+# Postgres that crashed during init would never become ready.
+pg_ready=0
+for _ in $(seq 1 60); do
+  if docker exec restoretest-nextcloud-postgres pg_isready -U nextcloud -d nextcloud >/dev/null 2>&1; then
+    pg_ready=1
+    break
+  fi
+  sleep 2
+done
+if [ "$pg_ready" -ne 1 ]; then
+  echo "Test Postgres did not become ready within 120s" >&2
+  docker logs --tail 120 restoretest-nextcloud-postgres >&2 || true
+  exit 1
+fi
+
+# Stage 3: import the dump (with retries, as for Paperless/Immich).
+restore_ok=0
+for _ in $(seq 1 12); do
+  if docker exec -i restoretest-nextcloud-postgres \
+    pg_restore -U nextcloud -d nextcloud --clean --if-exists --no-owner --no-privileges \
+    < "$RESTORE_ROOT/dumps/latest/nextcloud.dump" 2>/tmp/nextcloud-pg-restore.err; then
+    restore_ok=1
+    break
+  fi
+  # Server still initialising: wait and try again.
+  if grep -qiE "starting up|shutting down|connection refused|database .* does not exist" /tmp/nextcloud-pg-restore.err; then
+    sleep 5
+    continue
+  fi
+  # Hard server-side errors are never tolerated.
+  if grep -qiE "FATAL|PANIC" /tmp/nextcloud-pg-restore.err; then
+    cat /tmp/nextcloud-pg-restore.err >&2
+    exit 1
+  fi
+  # pg_restore exits non-zero but still completes the run when individual
+  # statements fail (for example --clean on objects that do not exist yet);
+  # it then reports "errors ignored on restore: N". Only that case is
+  # accepted. Any other failure (unreadable archive, aborted run) leaves
+  # restore_ok at 0 and fails below.
+  if grep -q "errors ignored on restore" /tmp/nextcloud-pg-restore.err; then
+    echo "WARN: pg_restore completed with ignored errors" >&2
+    restore_ok=1
+  fi
+  break
+done
+
+if [ "$restore_ok" -ne 1 ]; then
+  cat /tmp/nextcloud-pg-restore.err >&2
+  exit 1
+fi
+
+# Stage 4: start Nextcloud.
+docker compose -f "$COMPOSE_FILE" up -d restoretest-nextcloud >/dev/null
+
+# On the first start with an existing config.php Nextcloud needs some
+# seconds for its DB migration checks. The HTTP poll allows about 180s.
+http_status=""
+# Poll status.php against a wall-clock budget of 180s. --max-time keeps a
+# single hanging request from stalling the loop; without it curl can block
+# indefinitely on a container that accepts the connection but never answers.
+http_deadline=$((SECONDS + 180))
+while [ "$SECONDS" -lt "$http_deadline" ]; do
+  http_status="$(curl -s --max-time 15 -o /tmp/nextcloud-body.html -w '%{http_code}' \
+    -L http://127.0.0.1:18180/status.php || true)"
+  if [ "$http_status" = "200" ]; then
+    break
+  fi
+  sleep 2
+done
+
+if [ "$http_status" != "200" ]; then
+  echo "Nextcloud HTTP smoke failed: status=$http_status" >&2
+  docker logs --tail 120 restoretest-nextcloud >&2 || true
+  exit 1
+fi
+
+# Stage 5: read the maintenance flag from occ status.
+# A failing occ call degrades to '{}' so the report can still be written.
+occ_output="$(docker exec -u www-data restoretest-nextcloud php occ status --output=json 2>/dev/null || echo '{}')"
+# "|| true" is required: under "set -euo pipefail" a grep without a match
+# makes the whole assignment fail and would abort the script before the
+# "unknown" fallback below is reached.
+maintenance="$(printf '%s\n' "$occ_output" | grep -o '"maintenance":[a-z]*' | head -1 | cut -d: -f2 || true)"
+if [ -z "$maintenance" ]; then
+  maintenance="unknown"
+fi
+# NOTE(review): the flag is only recorded in the code visible here; confirm
+# whether a value other than "false" is meant to fail the smoke test.
+
+# Table count in the public schema as a functional sanity check; falls back
+# to "n/a" when psql fails.
+table_count="$(docker exec restoretest-nextcloud-postgres \
+  psql -U nextcloud -d nextcloud -tAc \
+  "SELECT count(*) FROM information_schema.tables WHERE table_schema='public';" \
+  2>/dev/null | tr -d '[:space:]' || echo "n/a")"
+
+write_report "$REPORT_FILE" < $REPORT_FILE" diff --git a/ops/restore-tests/run-restore-checks.sh b/ops/restore-tests/run-restore-checks.sh index 98bf956..9cc6292 100755 --- a/ops/restore-tests/run-restore-checks.sh +++ b/ops/restore-tests/run-restore-checks.sh @@ -40,6 +40,12 @@ case "$MODE" in fi exec "$SCRIPT_DIR/authelia-restore-test.sh" ;; + nextcloud) + if [ "$WHATIF" = "--what-if" ]; then + exec "$SCRIPT_DIR/nextcloud-restore-test.sh" --what-if + fi + exec "$SCRIPT_DIR/nextcloud-restore-test.sh" + ;; komodo-bootstrap) if [ "$WHATIF" = "--what-if" ]; then exec "$SCRIPT_DIR/komodo-bootstrap-test.sh" --what-if @@ -47,7 +53,7 @@ case "$MODE" in exec "$SCRIPT_DIR/komodo-bootstrap-test.sh" ;; *) - echo "Usage: $0 {freshness|vaultwarden|gitea|paperless|immich|authelia|komodo-bootstrap} [--what-if]" >&2 + echo "Usage: $0 {freshness|vaultwarden|gitea|paperless|immich|authelia|nextcloud|komodo-bootstrap} [--what-if]" >&2 exit 1 ;; esac diff --git 
a/ops/restore-tests/run-restore-job-with-ntfy.sh b/ops/restore-tests/run-restore-job-with-ntfy.sh index 0a17dd6..c2ddd20 100644 --- a/ops/restore-tests/run-restore-job-with-ntfy.sh +++ b/ops/restore-tests/run-restore-job-with-ntfy.sh @@ -7,7 +7,7 @@ SUCCESS_TOPIC="${2:-${RESTORE_SUCCESS_TOPIC:-homelab-info}}" FAILURE_TOPIC="${RESTORE_FAILURE_TOPIC:-homelab-alerts}" if [ -z "$MODE" ]; then - echo "Usage: $0 [success_topic]" >&2 + echo "Usage: $0 [success_topic]" >&2 exit 1 fi