Compare commits
2 Commits
7c50e69b44
...
84020346bc
| Author | SHA1 | Date | |
|---|---|---|---|
| 84020346bc | |||
| 1dc1c1ef17 |
@@ -2,21 +2,23 @@
|
||||
"""
|
||||
check_health.py — Homelab Alert Enricher
|
||||
=========================================
|
||||
Laedt services.yaml, prueft Docker-Health aller bekannten Abhaengigkeiten,
|
||||
Laedt services.json, prueft Docker-Health aller bekannten Abhaengigkeiten,
|
||||
liest Dump-Timestamps und gibt einen strukturierten JSON-Report aus.
|
||||
|
||||
Hermes liest diesen Report und baut daraus eine angereicherte ntfy-Nachricht.
|
||||
|
||||
Keine externen Abhaengigkeiten — nur Python-Standardbibliothek.
|
||||
|
||||
Verwendung:
|
||||
python3 check_health.py # alle unhealthy Container
|
||||
python3 check_health.py paperless-ngx # gezielt einen Service pruefen
|
||||
python3 check_health.py --summary # Gesamtstatus als Zusammenfassung
|
||||
|
||||
Pfad auf Host (via Komodo-Clone):
|
||||
/mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py
|
||||
Pfad auf der Hermes-VM (via git pull):
|
||||
/srv/hermes-workspace/homelab-infra/ops/hermes-agent/scripts/check_health.py
|
||||
|
||||
services.yaml wird relativ zum Script-Verzeichnis gesucht:
|
||||
../services.yaml
|
||||
services.json wird relativ zum Script-Verzeichnis gesucht:
|
||||
../services.json
|
||||
"""
|
||||
|
||||
import json
|
||||
@@ -31,10 +33,10 @@ from pathlib import Path
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
SCRIPT_DIR = Path(__file__).parent.resolve()
|
||||
SERVICES_YAML_PATH = SCRIPT_DIR.parent / "services.yaml"
|
||||
SERVICES_JSON_PATH = SCRIPT_DIR.parent / "services.json"
|
||||
|
||||
# Fallback falls das Repo unter einem anderen Pfad liegt
|
||||
SERVICES_YAML_FALLBACK = Path("/mnt/user/services/homelab/ops/hermes-agent/services.yaml")
|
||||
SERVICES_JSON_FALLBACK = Path("/srv/hermes-workspace/homelab-infra/ops/hermes-agent/services.json")
|
||||
|
||||
# Dump-Warnschwelle in Stunden (aelter = Warnung)
|
||||
DUMP_WARN_HOURS = 26
|
||||
@@ -45,23 +47,17 @@ DUMP_WARN_HOURS = 26
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_services():
    """Load the service catalogue from services.json.

    Tries ``SERVICES_JSON_PATH`` first and uses ``SERVICES_JSON_FALLBACK``
    when that file does not exist. Only the standard-library ``json`` module
    is used; nothing is installed or imported at runtime.

    Returns:
        tuple[dict, dict]: ``(services, meta)``. A section that is missing
        or ``null`` in the file is returned as an empty dict.

    Raises:
        FileNotFoundError: neither candidate path exists.
        json.JSONDecodeError: the file is not valid JSON.
        ValueError: the JSON root is not an object.
    """
    path = SERVICES_JSON_PATH if SERVICES_JSON_PATH.exists() else SERVICES_JSON_FALLBACK
    if not path.exists():
        raise FileNotFoundError(
            f"services.json nicht gefunden: {path}\n"
            f"Bitte 'git pull' in /srv/hermes-workspace/homelab-infra/ ausfuehren."
        )

    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    # A list/str/number root would otherwise fail below with an opaque
    # AttributeError on .get(); report the real problem instead.
    if not isinstance(data, dict):
        raise ValueError(f"services.json hat kein JSON-Objekt als Wurzel: {path}")

    # "or {}" also covers an explicit null section, so callers always get dicts.
    return data.get("services") or {}, data.get("meta") or {}
|
||||
|
||||
|
||||
@@ -0,0 +1,499 @@
|
||||
{
|
||||
"meta": {
|
||||
"dump_base": "/mnt/user/backups/borg/dumps/latest",
|
||||
"appdata_base": "/mnt/user/appdata",
|
||||
"secrets_path": "/mnt/user/appdata/secrets"
|
||||
},
|
||||
"services": {
|
||||
"traefik": {
|
||||
"description": "Zentraler Reverse Proxy, TLS, Docker-Label-Routing",
|
||||
"tier": 1,
|
||||
"category": "core",
|
||||
"container_name": "traefik",
|
||||
"dependencies": [],
|
||||
"url": "https://traefik.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/traefik/dynamic", "/mnt/user/appdata/traefik/letsencrypt"],
|
||||
"first_check": "Host-Ports 80/443 erreichbar? dynamic/ korrekt auf Host synchronisiert?",
|
||||
"notes": "dynamic configs werden NICHT automatisch von Komodo deployed — manueller Host-Sync noetig"
|
||||
},
|
||||
"adguard": {
|
||||
"description": "DNS-Server / LAN DNS",
|
||||
"tier": 1,
|
||||
"category": "core",
|
||||
"container_name": "adguard",
|
||||
"dependencies": ["unbound"],
|
||||
"url": null,
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/adguard/conf", "/mnt/user/appdata/adguard/work"],
|
||||
"first_check": "Port 53 erreichbar? Unbound healthy? dns_net Konnektivitaet?",
|
||||
"notes": "Ports 53 und 8082 dokumentierte Host-Port-Ausnahmen"
|
||||
},
|
||||
"unbound": {
|
||||
"description": "Upstream DNS Resolver fuer AdGuard",
|
||||
"tier": 1,
|
||||
"category": "core",
|
||||
"container_name": "unbound",
|
||||
"dependencies": [],
|
||||
"url": null,
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/unbound/config"],
|
||||
"first_check": "dns_net Konnektivitaet pruefen; Container-Logs auf Fehler pruefen",
|
||||
"notes": "rebuildbar; isoliert in dns_net"
|
||||
},
|
||||
"tailscale": {
|
||||
"description": "VPN / Remote-Zugang",
|
||||
"tier": 1,
|
||||
"category": "core",
|
||||
"container_name": "tailscale",
|
||||
"dependencies": [],
|
||||
"url": null,
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/tailscale"],
|
||||
"first_check": "Tailscale Status auf Host pruefen; State-Datei fuer Key-Renewal vorhanden?",
|
||||
"notes": "network_mode: host; NET_ADMIN, NET_RAW, /dev/net/tun — dokumentierte VPN-Ausnahmen"
|
||||
},
|
||||
"gitea": {
|
||||
"description": "Git-Server — operative Quelle der Wahrheit fuer GitOps",
|
||||
"tier": 1,
|
||||
"category": "core",
|
||||
"container_name": "gitea",
|
||||
"dependencies": ["traefik"],
|
||||
"url": "https://git.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/services/gitea/data"],
|
||||
"first_check": "HTTPS erreichbar? SQLite in /data intakt? SSH-Port 222 erreichbar?",
|
||||
"notes": "SQLite in /data — kein separater Dump; ohne externen Mirror im DR kritisch"
|
||||
},
|
||||
"authelia": {
|
||||
"description": "ForwardAuth — zentrale Authentifizierung fuer Admin-UIs",
|
||||
"tier": 1,
|
||||
"category": "security",
|
||||
"container_name": "authelia",
|
||||
"dependencies": ["postgresql17", "traefik"],
|
||||
"url": "https://auth.kaleschke.info",
|
||||
"dump_file": "postgresql17-authelia.dump",
|
||||
"data_paths": ["/mnt/user/appdata/authelia/config"],
|
||||
"first_check": "PostgreSQL healthy? SMTP via GMX erreichbar? Host-Config aktuell (Repo-Baseline != Host)?",
|
||||
"notes": "kein Redis-Session-Backend; SMTP-Notifier GMX; Repo-Baseline muss manuell in Host-Config gemerged werden"
|
||||
},
|
||||
"vaultwarden": {
|
||||
"description": "Passwort-Tresor",
|
||||
"tier": 1,
|
||||
"category": "security",
|
||||
"container_name": "vaultwarden",
|
||||
"dependencies": ["traefik"],
|
||||
"url": "https://vault.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/vaultwarden"],
|
||||
"first_check": "HTTPS erreichbar? Appdata-Volume intakt?",
|
||||
"notes": "ADMIN_TOKEN_FILE; keine direkten Host-Ports"
|
||||
},
|
||||
"postgresql17": {
|
||||
"description": "Shared PostgreSQL Cluster",
|
||||
"tier": 1,
|
||||
"category": "infra",
|
||||
"container_name": "postgresql17",
|
||||
"dependencies": [],
|
||||
"url": null,
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/postgresql17"],
|
||||
"first_check": "backend_net Konnektivitaet? Disk-Space auf /mnt/user/appdata? pg_isready im Container?",
|
||||
"notes": "Dumps per Dienst unter dumps/latest; raw DB nicht primaerer Restore-Weg"
|
||||
},
|
||||
"komodo-core": {
|
||||
"description": "GitOps UI / API / Stack-Manager",
|
||||
"tier": 1,
|
||||
"category": "ops",
|
||||
"container_name": "komodo-core",
|
||||
"dependencies": ["komodo-mongo", "gitea", "traefik"],
|
||||
"url": "https://komodo.kaleschke.info",
|
||||
"dump_file": "komodo-mongo.archive.gz",
|
||||
"data_paths": ["/mnt/user/appdata/komodo/core"],
|
||||
"first_check": "MongoDB healthy? Gitea erreichbar? komodo_net Konnektivitaet?",
|
||||
"notes": "keine pauschale Authelia-ForwardAuth; Gitea DNS override konfiguriert"
|
||||
},
|
||||
"komodo-mongo": {
|
||||
"description": "Komodo Datenbank (MongoDB)",
|
||||
"tier": 1,
|
||||
"category": "infra",
|
||||
"container_name": "komodo-mongo",
|
||||
"dependencies": [],
|
||||
"url": null,
|
||||
"dump_file": "komodo-mongo.archive.gz",
|
||||
"data_paths": ["/mnt/user/appdata/komodo/mongo"],
|
||||
"first_check": "komodo_net Konnektivitaet? Disk-Space? mongosh ping?",
|
||||
"notes": "Dump-Integritaet nach Major-Upgrades pruefen"
|
||||
},
|
||||
"komodo-periphery": {
|
||||
"description": "Komodo Host-Agent (Stack-Deployments)",
|
||||
"tier": 1,
|
||||
"category": "ops",
|
||||
"container_name": "komodo-periphery",
|
||||
"dependencies": ["komodo-core"],
|
||||
"url": null,
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/komodo/periphery"],
|
||||
"first_check": "Docker-Socket lesbar? /mnt/user/services gemountet? komodo_net Verbindung zu Core?",
|
||||
"notes": "Docker-Socket-Ausnahme dokumentiert; /mnt/user/services Mount fuer Stack-Workspaces"
|
||||
},
|
||||
"redis": {
|
||||
"description": "Shared Redis Cache",
|
||||
"tier": 2,
|
||||
"category": "infra",
|
||||
"container_name": "redis",
|
||||
"dependencies": [],
|
||||
"url": null,
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/redis"],
|
||||
"first_check": "backend_net Konnektivitaet? redis-cli ping erreichbar?",
|
||||
"notes": "transiente Daten; bewusst nicht Backup-kritisch"
|
||||
},
|
||||
"paperless-ngx": {
|
||||
"description": "Dokumentenmanagement",
|
||||
"tier": 2,
|
||||
"category": "app",
|
||||
"container_name": "paperless-ngx",
|
||||
"dependencies": ["postgresql17", "redis", "traefik"],
|
||||
"url": "https://paperless.kaleschke.info",
|
||||
"dump_file": "postgresql17-paperless.dump",
|
||||
"data_paths": [
|
||||
"/mnt/user/appdata/paperless-ngx/data",
|
||||
"/mnt/user/documents/paperless",
|
||||
"/mnt/user/documents/scans_inbox"
|
||||
],
|
||||
"first_check": "Redis healthy? PostgreSQL healthy? backend_net Konnektivitaet?",
|
||||
"notes": "DB/Redis Secrets als Stack ENV (keine _FILE Variante)"
|
||||
},
|
||||
"paperless-gpt": {
|
||||
"description": "KI-Ergaenzung fuer Paperless",
|
||||
"tier": 2,
|
||||
"category": "app",
|
||||
"container_name": "paperless-gpt",
|
||||
"dependencies": ["paperless-ngx", "traefik"],
|
||||
"url": "https://paperless-gpt.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": [
|
||||
"/mnt/user/appdata/paperless-gpt/data",
|
||||
"/mnt/user/appdata/paperless-gpt/prompts"
|
||||
],
|
||||
"first_check": "Paperless API erreichbar? LLM/Ollama erreichbar? API Token gesetzt?",
|
||||
"notes": "API Token als Stack ENV; abhaengig von laufendem Paperless"
|
||||
},
|
||||
"immich_server": {
|
||||
"description": "Foto-/Video-App",
|
||||
"tier": 2,
|
||||
"category": "app",
|
||||
"container_name": "immich_server",
|
||||
"dependencies": ["immich_postgres", "immich_redis", "immich_machine_learning", "traefik"],
|
||||
"url": "https://immich.kaleschke.info",
|
||||
"dump_file": "immich.dump",
|
||||
"data_paths": ["/mnt/user/photos/immich", "/mnt/user/photos/family_archive"],
|
||||
"first_check": "immich_postgres healthy? immich_redis healthy? ML healthy? immich_default Netz?",
|
||||
"notes": "native App-Auth; externes Fotoarchiv gemountet"
|
||||
},
|
||||
"immich_postgres": {
|
||||
"description": "Immich-Datenbank",
|
||||
"tier": 2,
|
||||
"category": "infra",
|
||||
"container_name": "immich_postgres",
|
||||
"dependencies": [],
|
||||
"url": null,
|
||||
"dump_file": "immich.dump",
|
||||
"data_paths": ["/mnt/user/appdata/immich_postgres"],
|
||||
"first_check": "immich_default Netz? Disk-Space? pg_isready?",
|
||||
"notes": "nie ins frontend_net; immich_default Netz isoliert"
|
||||
},
|
||||
"immich_redis": {
|
||||
"description": "Immich Cache",
|
||||
"tier": 2,
|
||||
"category": "infra",
|
||||
"container_name": "immich_redis",
|
||||
"dependencies": [],
|
||||
"url": null,
|
||||
"dump_file": null,
|
||||
"data_paths": [],
|
||||
"first_check": "immich_default Netz? redis-cli ping?",
|
||||
"notes": "rebuildbar; anonymes Volume — named volume als offenes TODO"
|
||||
},
|
||||
"immich_machine_learning": {
|
||||
"description": "Immich ML (Gesichtserkennung, Suche)",
|
||||
"tier": 2,
|
||||
"category": "infra",
|
||||
"container_name": "immich_machine_learning",
|
||||
"dependencies": [],
|
||||
"url": null,
|
||||
"dump_file": null,
|
||||
"data_paths": [],
|
||||
"first_check": "immich_default Netz? model-cache Volume vorhanden?",
|
||||
"notes": "rebuildbar; intern-only"
|
||||
},
|
||||
"mealie": {
|
||||
"description": "Rezeptverwaltung",
|
||||
"tier": 2,
|
||||
"category": "app",
|
||||
"container_name": "mealie",
|
||||
"dependencies": ["mealie-postgres", "traefik"],
|
||||
"url": "https://mealie.kaleschke.info",
|
||||
"dump_file": "mealie.dump",
|
||||
"data_paths": ["/mnt/user/appdata/mealie/data"],
|
||||
"first_check": "mealie-postgres healthy? mealie_internal Netz erreichbar?",
|
||||
"notes": "App + DB in internem Netz getrennt (mealie_internal)"
|
||||
},
|
||||
"mealie-postgres": {
|
||||
"description": "Mealie-Datenbank",
|
||||
"tier": 2,
|
||||
"category": "infra",
|
||||
"container_name": "mealie-postgres",
|
||||
"dependencies": [],
|
||||
"url": null,
|
||||
"dump_file": "mealie.dump",
|
||||
"data_paths": ["/mnt/user/appdata/mealie/postgres"],
|
||||
"first_check": "mealie_internal Netz? Disk-Space?",
|
||||
"notes": "interne DB; mealie_internal Netz"
|
||||
},
|
||||
"mail-archiver": {
|
||||
"description": "Mail-Archivierung (IMAP)",
|
||||
"tier": 2,
|
||||
"category": "app",
|
||||
"container_name": "mail-archiver",
|
||||
"dependencies": ["postgresql17", "authelia", "traefik"],
|
||||
"url": "https://mail.kaleschke.info",
|
||||
"dump_file": "postgresql17-mailarchiver.dump",
|
||||
"data_paths": ["/mnt/user/appdata/mailarchiver/data-protection-keys"],
|
||||
"first_check": "PostgreSQL healthy? Internet-/IMAP-Zugang? Authelia healthy?",
|
||||
"notes": "Hybrid: frontend_net fuer IMAP/Internet, backend_net fuer DB"
|
||||
},
|
||||
"nextcloud": {
|
||||
"description": "Datei-/Cloud-Dienst",
|
||||
"tier": 2,
|
||||
"category": "app",
|
||||
"container_name": "nextcloud",
|
||||
"dependencies": ["nextcloud-postgres", "nextcloud-redis", "traefik"],
|
||||
"url": "https://cloud.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": [
|
||||
"/mnt/user/appdata/nextcloud/html",
|
||||
"/mnt/user/documents/nextcloud-data"
|
||||
],
|
||||
"first_check": "nextcloud-postgres healthy? nextcloud-redis healthy? nextcloud_internal Netz?",
|
||||
"notes": "native App-Auth (kein zentrales ForwardAuth); WebDAV/CardDAV beachten"
|
||||
},
|
||||
"nextcloud-postgres": {
|
||||
"description": "Nextcloud-Datenbank",
|
||||
"tier": 2,
|
||||
"category": "infra",
|
||||
"container_name": "nextcloud-postgres",
|
||||
"dependencies": [],
|
||||
"url": null,
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/nextcloud/postgres"],
|
||||
"first_check": "nextcloud_internal Netz? Disk-Space?",
|
||||
"notes": "interne DB"
|
||||
},
|
||||
"nextcloud-redis": {
|
||||
"description": "Nextcloud Cache / Locking",
|
||||
"tier": 2,
|
||||
"category": "infra",
|
||||
"container_name": "nextcloud-redis",
|
||||
"dependencies": [],
|
||||
"url": null,
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/nextcloud/redis"],
|
||||
"first_check": "nextcloud_internal Netz? redis-cli ping?",
|
||||
"notes": "rebuildbar"
|
||||
},
|
||||
"ntfy": {
|
||||
"description": "Push-Benachrichtigungen (Alert-Backbone)",
|
||||
"tier": 2,
|
||||
"category": "app",
|
||||
"container_name": "ntfy",
|
||||
"dependencies": ["traefik"],
|
||||
"url": "https://ntfy.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/ntfy"],
|
||||
"first_check": "HTTPS erreichbar? NTFY_BEHIND_PROXY=true gesetzt? Traefik healthy?",
|
||||
"notes": "KRITISCH: Ausfall bedeutet keine anderen Alerts ankommen"
|
||||
},
|
||||
"homepage": {
|
||||
"description": "Start-Dashboard",
|
||||
"tier": 3,
|
||||
"category": "ops",
|
||||
"container_name": "homepage",
|
||||
"dependencies": ["traefik"],
|
||||
"url": "https://home.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/homepage"],
|
||||
"first_check": "Traefik erreichbar? Docker-Socket read-only lesbar? API-Tokens gueltig?",
|
||||
"notes": "Docker socket read-only; viele API Tokens in Config"
|
||||
},
|
||||
"uptime-kuma": {
|
||||
"description": "Monitoring / Uptime Checks",
|
||||
"tier": 3,
|
||||
"category": "ops",
|
||||
"container_name": "UptimeKuma",
|
||||
"dependencies": ["traefik"],
|
||||
"url": "https://uptime.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/uptime-kuma"],
|
||||
"first_check": "Datenbank-Volume intakt? Traefik erreichbar?",
|
||||
"notes": "Monitore nach Restore manuell pruefen"
|
||||
},
|
||||
"grafana": {
|
||||
"description": "Metrik-Dashboard",
|
||||
"tier": 3,
|
||||
"category": "ops",
|
||||
"container_name": "grafana",
|
||||
"dependencies": ["influxdb3-core", "traefik"],
|
||||
"url": "https://grafana.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/grafana"],
|
||||
"first_check": "influxdb3-core healthy? Datasource-Token gesetzt? Provisioning-Konfig vorhanden?",
|
||||
"notes": "laeuft als user 0; Datasource wird provisioniert"
|
||||
},
|
||||
"influxdb3-core": {
|
||||
"description": "Zeitreihen- / Metrikdaten fuer Grafana und Home Assistant",
|
||||
"tier": 3,
|
||||
"category": "ops",
|
||||
"container_name": "influxdb3-core",
|
||||
"dependencies": [],
|
||||
"url": null,
|
||||
"dump_file": null,
|
||||
"data_paths": [
|
||||
"/mnt/user/appdata/influxdb3/data",
|
||||
"/mnt/user/appdata/influxdb3/plugins"
|
||||
],
|
||||
"first_check": "LAN-Port 8181 erreichbar? 401 ohne Token = OK (erwartet). Disk-Space?",
|
||||
"notes": "LAN-only Host-Port 8181; kein frontend_net; laeuft als user 0"
|
||||
},
|
||||
"scrutiny": {
|
||||
"description": "Laufwerks- / SMART-Monitoring",
|
||||
"tier": 3,
|
||||
"category": "ops",
|
||||
"container_name": "scrutiny",
|
||||
"dependencies": ["traefik"],
|
||||
"url": "https://scrutiny.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": [
|
||||
"/mnt/user/appdata/scrutiny/config",
|
||||
"/mnt/user/appdata/scrutiny/influxdb"
|
||||
],
|
||||
"first_check": "Device-Mounts vorhanden? privileged=true gesetzt? Traefik erreichbar?",
|
||||
"notes": "privileged: true dokumentierte Ausnahme"
|
||||
},
|
||||
"glances": {
|
||||
"description": "System- / Container-Monitoring",
|
||||
"tier": 3,
|
||||
"category": "ops",
|
||||
"container_name": "glances",
|
||||
"dependencies": ["traefik"],
|
||||
"url": "https://glances.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": [],
|
||||
"first_check": "Docker-Socket lesbar? rootfs gemountet? Traefik erreichbar?",
|
||||
"notes": "rebuildbar; Docker-Socket und rootfs Mounts"
|
||||
},
|
||||
"borg-ui": {
|
||||
"description": "Borg Backup- / Restore UI",
|
||||
"tier": 3,
|
||||
"category": "ops",
|
||||
"container_name": "borg-ui",
|
||||
"dependencies": ["traefik"],
|
||||
"url": "https://borg.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": [
|
||||
"/mnt/user/appdata/borg-ui/data",
|
||||
"/mnt/user/backups/borg/dumps"
|
||||
],
|
||||
"first_check": "Borg-Repo-Credentials vorhanden? Backup-Mounts erreichbar? Traefik healthy?",
|
||||
"notes": "breite Mounts bewusst dokumentiert; /local/secrets im DR-Scope"
|
||||
},
|
||||
"backrest": {
|
||||
"description": "Backup-Admin-Dienst",
|
||||
"tier": 3,
|
||||
"category": "ops",
|
||||
"container_name": "backrest",
|
||||
"dependencies": ["traefik"],
|
||||
"url": "https://backrest.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/backrest"],
|
||||
"first_check": "Repo/SSH-Mounts erreichbar? Traefik healthy?",
|
||||
"notes": "breite Mounts bewusst dokumentiert"
|
||||
},
|
||||
"hermes-gateway": {
|
||||
"description": "Hermes Agent Gateway / AI Ops Assistant",
|
||||
"tier": 3,
|
||||
"category": "ops",
|
||||
"container_name": "hermes-gateway",
|
||||
"dependencies": [],
|
||||
"url": null,
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/hermes-agent/data"],
|
||||
"first_check": "hermes_net:8642/health erreichbar? SSH-Key gemountet? LLM-Provider erreichbar?",
|
||||
"notes": "kein Docker-Socket; SSH terminal backend; echte .env auf Host-Appdata"
|
||||
},
|
||||
"ddns-updater": {
|
||||
"description": "Cloudflare / DDNS Aktualisierung",
|
||||
"tier": 3,
|
||||
"category": "infra",
|
||||
"container_name": "ddns-updater",
|
||||
"dependencies": [],
|
||||
"url": null,
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/ddns-updater"],
|
||||
"first_check": "Internetzugang? Cloudflare API erreichbar? Config vorhanden?",
|
||||
"notes": "bewusst in frontend_net weil backend_net internal ist"
|
||||
},
|
||||
"code-server": {
|
||||
"description": "Web-Editor / Operations Workspace",
|
||||
"tier": 3,
|
||||
"category": "ops",
|
||||
"container_name": "code-server",
|
||||
"dependencies": ["traefik"],
|
||||
"url": "https://code.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": [
|
||||
"/mnt/user/appdata/code-server",
|
||||
"/mnt/user/services/dev"
|
||||
],
|
||||
"first_check": "Traefik erreichbar? PASSWORD_FILE lesbar?",
|
||||
"notes": "PASSWORD_FILE; Workspaces bei Restore beachten"
|
||||
},
|
||||
"filebrowser": {
|
||||
"description": "Datei-Browser fuer Appdata",
|
||||
"tier": 3,
|
||||
"category": "ops",
|
||||
"container_name": "filebrowser",
|
||||
"dependencies": ["traefik"],
|
||||
"url": "https://files.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/filebrowser"],
|
||||
"first_check": "Appdata-Mounts erreichbar? Traefik healthy?",
|
||||
"notes": "breiter /mnt/user/appdata Mount; Einschraenkung langfristig als TODO"
|
||||
},
|
||||
"speedtest-tracker": {
|
||||
"description": "Speedtest-Monitoring",
|
||||
"tier": 3,
|
||||
"category": "ops",
|
||||
"container_name": "speedtest-tracker",
|
||||
"dependencies": ["traefik"],
|
||||
"url": "https://speedtest.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": ["/mnt/user/appdata/speedtest-tracker/config"],
|
||||
"first_check": "APP_KEY gesetzt? Internetzugang fuer Speedtest vorhanden?",
|
||||
"notes": "APP_KEY, ADMIN_PASSWORD als Stack ENV"
|
||||
},
|
||||
"bentopdf": {
|
||||
"description": "PDF-Tooling",
|
||||
"tier": 3,
|
||||
"category": "app",
|
||||
"container_name": "bentopdf",
|
||||
"dependencies": ["traefik"],
|
||||
"url": "https://pdf.kaleschke.info",
|
||||
"dump_file": null,
|
||||
"data_paths": [],
|
||||
"first_check": "COOP/COEP Middleware gesetzt? Traefik healthy?",
|
||||
"notes": "rebuildbar; keine kritische Persistenz"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,45 @@
|
||||
# Restore Tests
|
||||
|
||||
Kontrollierte Restore-Tests fuer `homelab-infra`.
|
||||
|
||||
Ziel:
|
||||
|
||||
- Backups durch echte Test-Restores verifizieren
|
||||
- nicht in produktive Pfade schreiben
|
||||
- Testlaeufe spaeter weitgehend automatisieren
|
||||
|
||||
## Grundregeln
|
||||
|
||||
- Restore-Quelle bleibt im Backup-Bereich, z. B. `/mnt/user/backups/borg`
|
||||
- Test-Restores laufen nur in `/mnt/user/backups/restore-lab`
|
||||
- Reports landen in `/mnt/user/backups/restore-reports`
|
||||
- Test-Container nutzen das Praefix `restoretest-`
|
||||
- keine produktiven Volumes schreibend mounten
|
||||
- keine produktiven Domains fuer Testinstanzen uebernehmen
|
||||
|
||||
## Geplante Struktur
|
||||
|
||||
- `schedule.md`: Intervalle und Verantwortlichkeiten
|
||||
- `vaultwarden-restore-test.ps1`: erster Mini-Restore-Ablauf
|
||||
- `vaultwarden-plan.md`: konkreter Vaultwarden-Testplan
|
||||
- `vaultwarden-compose.test.yml`: isolierte Testinstanz fuer Vaultwarden
|
||||
- spaeter weitere Tests fuer `gitea` und `paperless`
|
||||
|
||||
## Automatisierungsmodell
|
||||
|
||||
- Ausfuehrung: Unraid User Script / Host-Job
|
||||
- Logik: Repo-Skripte in diesem Verzeichnis
|
||||
- Ergebnis: Markdown-Report
|
||||
- Meldung: `ntfy`
|
||||
- Hermes: optional nur fuer Zusammenfassung und Auswertung
|
||||
|
||||
## Status
|
||||
|
||||
Aktuell ist hier nur die erste Repo-Vorbereitung angelegt.
|
||||
|
||||
- noch kein echter Restore-Lauf
|
||||
- noch nicht in Host-Pfade geschrieben
|
||||
- noch keine Container gestartet
|
||||
- erster V1-Ablauf ohne `ntfy`, mit Bereinigung nach Erfolg
|
||||
|
||||
Vor dem ersten echten Testlauf muessen Zielpfade, Quellpfade und Bereinigungsschritte bewusst freigegeben werden.
|
||||
@@ -0,0 +1,74 @@
|
||||
# Restore Test Schedule
|
||||
|
||||
## Ziel
|
||||
|
||||
Regelmaessige Restore-Tests mit wenig Handarbeit und klaren Nachweisen.
|
||||
|
||||
## Intervalle
|
||||
|
||||
Woechentlich:
|
||||
|
||||
- Backup-/Dump-Frische pruefen
|
||||
- keine echten Restore-Container starten
|
||||
- pruefen:
|
||||
- Dump-Dateien vorhanden
|
||||
- Dump-Dateien nicht zu alt
|
||||
- letzte Reports vorhanden
|
||||
|
||||
Monatlich:
|
||||
|
||||
- `vaultwarden` Mini-Restore
|
||||
- `gitea` Mini-Restore, versetzt zum Vaultwarden-Lauf
|
||||
|
||||
Alle 2 Monate:
|
||||
|
||||
- `paperless` Mini-Restore
|
||||
|
||||
Quartalsweise:
|
||||
|
||||
- Restore-/DR-Sanity-Check
|
||||
- pruefen:
|
||||
- Restore-Lab-Struktur
|
||||
- Reports
|
||||
- Skripte und Pfade
|
||||
- Doku noch passend
|
||||
|
||||
Spaeter:
|
||||
|
||||
- `immich` als eigener Sprint
|
||||
|
||||
## Automatisierung
|
||||
|
||||
Automatisch:
|
||||
|
||||
- Testpfad vorbereiten
|
||||
- Restore in `restore-lab`
|
||||
- Testcontainer `restoretest-*` starten
|
||||
- Smoke-Test ausfuehren
|
||||
- Markdown-Report schreiben
|
||||
- `ntfy` Erfolg/Fehler senden
|
||||
- alte Testartefakte bereinigen
|
||||
|
||||
Manuell:
|
||||
|
||||
- neue Testfaelle einfuehren
|
||||
- riskante oder produktionsnahe Sondertests
|
||||
- Aenderungen an Restore-Logik
|
||||
- Freigabe fuer den ersten echten Restore je Dienst
|
||||
|
||||
## Ablage
|
||||
|
||||
- Quelle: `/mnt/user/backups/borg`
|
||||
- Restore-Lab: `/mnt/user/backups/restore-lab`
|
||||
- Reports: `/mnt/user/backups/restore-reports`
|
||||
|
||||
## Erfolgsregel
|
||||
|
||||
Ein Test gilt erst dann als erfolgreich, wenn:
|
||||
|
||||
- Restore abgeschlossen ist
|
||||
- Testcontainer startet
|
||||
- definierter Smoke-Test erfolgreich ist
|
||||
- Report geschrieben wurde
|
||||
|
||||
Nur "Container laeuft" reicht nicht.
|
||||
@@ -0,0 +1,25 @@
|
||||
services:
|
||||
restoretest-vaultwarden:
|
||||
image: vaultwarden/server:latest@sha256:9a8eec71f4a52411cc43edc7a50f33e9b6f62b5baca0dd95f0c6e7fd60f1a341
|
||||
container_name: restoretest-vaultwarden
|
||||
restart: "no"
|
||||
|
||||
environment:
|
||||
TZ: Europe/Berlin
|
||||
DOMAIN: http://127.0.0.1:18080
|
||||
WEBSOCKET_ENABLED: "true"
|
||||
SIGNUPS_ALLOWED: "false"
|
||||
INVITATIONS_ALLOWED: "false"
|
||||
ADMIN_TOKEN_FILE: /run/secrets/admin_token
|
||||
ROCKET_PORT: 80
|
||||
ROCKET_ADDRESS: 0.0.0.0
|
||||
|
||||
ports:
|
||||
- "127.0.0.1:18080:80"
|
||||
|
||||
volumes:
|
||||
- /mnt/user/backups/restore-lab/vaultwarden/data:/data
|
||||
- /mnt/user/appdata/secrets/vaultwarden_admin_token.txt:/run/secrets/admin_token:ro
|
||||
|
||||
security_opt:
|
||||
- no-new-privileges:true
|
||||
@@ -0,0 +1,55 @@
|
||||
# Vaultwarden Restore Test Plan
|
||||
|
||||
## Ziel
|
||||
|
||||
Nachweisen, dass ein Vaultwarden-Backup in einer isolierten Testumgebung wieder startbar und fachlich nutzbar ist.
|
||||
|
||||
## Quelle
|
||||
|
||||
- Backup-Quelle: Borg / Share-Backup
|
||||
- fachlich relevanter Datenpfad: `/mnt/user/appdata/vaultwarden`
|
||||
- Secret: `/mnt/user/appdata/secrets/vaultwarden_admin_token.txt`
|
||||
|
||||
## Test-Ziel
|
||||
|
||||
- Restore-Lab: `/mnt/user/backups/restore-lab/vaultwarden`
|
||||
- Testdatenpfad: `/mnt/user/backups/restore-lab/vaultwarden/data`
|
||||
- Testcontainer: `restoretest-vaultwarden`
|
||||
- Testport: `127.0.0.1:18080:80`
|
||||
- Report-Ziel: `/mnt/user/backups/restore-reports/vaultwarden-YYYY-MM-DD.md`
|
||||
|
||||
## Schutzregeln
|
||||
|
||||
- nie in den produktiven Pfad `/mnt/user/appdata/vaultwarden` schreiben
|
||||
- produktive Domain `vault.kaleschke.info` nicht fuer die Testinstanz uebernehmen
|
||||
- keine Traefik-Labels fuer die Testinstanz
|
||||
- Testcontainer nur gegen Restore-Lab-Daten starten
|
||||
|
||||
## Geplanter Ablauf
|
||||
|
||||
1. Restore-Ziel unter `/mnt/user/backups/restore-lab/vaultwarden` vorbereiten
|
||||
2. Vaultwarden-Daten aus Backup in `restore-lab/vaultwarden/data` wiederherstellen
|
||||
3. Testinstanz mit `ops/restore-tests/vaultwarden-compose.test.yml` starten
|
||||
4. lokalen Smoke-Test gegen `http://127.0.0.1:18080` ausfuehren
|
||||
5. Report unter `/mnt/user/backups/restore-reports/` schreiben
|
||||
6. Testcontainer stoppen und Testumgebung bereinigen oder bewusst stehen lassen
|
||||
|
||||
## Smoke-Test
|
||||
|
||||
Minimal erfolgreich:
|
||||
|
||||
- Container startet
|
||||
- Login-Seite antwortet
|
||||
- Vaultwarden-Daten sind vorhanden
|
||||
|
||||
Optional spaeter:
|
||||
|
||||
- Admin-Endpunkt pruefen
|
||||
- Websocket-Endpunkt pruefen
|
||||
- Anzahl/Vorhandensein zentraler Daten anhand konkreter Artefakte verifizieren (z. B. Datenbankdatei vorhanden und nicht leer)
|
||||
|
||||
## Noch offen vor dem ersten echten Lauf
|
||||
|
||||
- exakter Borg-Restore-Befehl bzw. Restore-Quelle auf dem Host
|
||||
- Bereinigungsstrategie fuer alte Restore-Lab-Daten
|
||||
- ob Reports nur auf dem Host liegen oder zusaetzlich per ntfy referenziert werden
|
||||
@@ -0,0 +1,34 @@
|
||||
# Scaffold for the Vaultwarden restore test.
# Prints the configured paths and the planned steps. It performs no restore,
# starts no container and writes no file.
param(
    # Borg backup location the restore would read from.
    [string]$BackupSource = "/mnt/user/backups/borg",
    # Isolated restore-lab directory the test data would be extracted into.
    [string]$RestoreRoot = "/mnt/user/backups/restore-lab/vaultwarden",
    # Directory the markdown report would be written to.
    [string]$ReportRoot = "/mnt/user/backups/restore-reports",
    # Only changes the printed mode label; nothing is executed in either mode.
    [switch]$WhatIf
)

Set-StrictMode -Version Latest
$ErrorActionPreference = "Stop"

Write-Output "Vaultwarden restore test scaffold"
Write-Output "BackupSource: $BackupSource"
Write-Output "RestoreRoot: $RestoreRoot"
Write-Output "ReportRoot: $ReportRoot"
Write-Output "Expected Borg source path inside archive: local/appdata/vaultwarden"

if ($WhatIf) {
    Write-Output "Mode: WhatIf"
} else {
    Write-Output "Mode: Scaffold only"
}

Write-Output ""
Write-Output "Planned steps:"
# Steps 1 and 5 echo the effective parameters so that overriding -RestoreRoot
# or -ReportRoot is reflected in the plan (defaults print the same text as before).
Write-Output "1. Prepare restore-lab target under $RestoreRoot"
Write-Output "2. Restore Vaultwarden data into an isolated test path"
# Single-quoted on purpose: the shell variable must be printed literally.
# Borg 1.x expects REPO::ARCHIVE as one argument, followed by the paths.
Write-Output ' Template: borg extract "${BORG_REPO}::ARCHIVE_NAME" local/appdata/vaultwarden'
Write-Output "3. Start container restoretest-vaultwarden against test data only"
Write-Output "4. Run smoke checks against the test instance"
Write-Output "5. Write markdown report under $ReportRoot"
Write-Output "6. Send optional ntfy result"
Write-Output ""
Write-Output "This script is intentionally a scaffold only."
Write-Output "No restore, no container start, no file write is executed yet."
|
||||
@@ -0,0 +1,97 @@
|
||||
# Vaultwarden Restore Runbook
|
||||
|
||||
## Vorbedingungen
|
||||
|
||||
- Borg-Quelle ist verfuegbar
|
||||
- Secret-Datei vorhanden: `/mnt/user/appdata/secrets/vaultwarden_admin_token.txt`
|
||||
- Testpfade unter `/mnt/user/backups/restore-lab/` und `/mnt/user/backups/restore-reports/` sind freigegeben
|
||||
|
||||
## Platzhalter
|
||||
|
||||
- `ARCHIVE_NAME`: Borg-Archiv fuer den Restore-Test
|
||||
- `REPORT_DATE`: z. B. `2026-05-06`
|
||||
- `BORG_REPO`: Host-Borg-Repo, z. B. das produktive `critical_infra`
|
||||
- `BORG_PASSPHRASE`: wie im bestehenden Host-Setup
|
||||
|
||||
## Ablauf
|
||||
|
||||
1. Testpfade vorbereiten
|
||||
|
||||
```bash
|
||||
mkdir -p /mnt/user/backups/restore-lab/vaultwarden/data
|
||||
mkdir -p /mnt/user/backups/restore-reports
|
||||
rm -rf /mnt/user/backups/restore-lab/vaultwarden/data/*
|
||||
```
|
||||
|
||||
2. Vaultwarden-Daten aus Borg in das Restore-Lab extrahieren
|
||||
|
||||
Archiv zuerst pruefen:
|
||||
|
||||
```bash
|
||||
export BORG_REPO='...'
|
||||
export BORG_PASSPHRASE='...'
|
||||
borg list "$BORG_REPO"
|
||||
```
|
||||
|
||||
Restore in das Testziel:
|
||||
|
||||
```bash
|
||||
cd /mnt/user/backups/restore-lab/vaultwarden
|
||||
borg extract "${BORG_REPO}::ARCHIVE_NAME" local/appdata/vaultwarden
|
||||
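# data/ existiert bereits aus Schritt 1; leeres Verzeichnis zuerst entfernen, sonst landet der Restore in data/vaultwarden
rmdir /mnt/user/backups/restore-lab/vaultwarden/data && mv /mnt/user/backups/restore-lab/vaultwarden/local/appdata/vaultwarden /mnt/user/backups/restore-lab/vaultwarden/data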
|
||||
rmdir /mnt/user/backups/restore-lab/vaultwarden/local/appdata
|
||||
rmdir /mnt/user/backups/restore-lab/vaultwarden/local
|
||||
```
|
||||
|
||||
Wenn das Archiv den Pfad anders ablegt, zuerst mit `borg list "${BORG_REPO}::ARCHIVE_NAME"` den exakten Eintrag pruefen.
|
||||
|
||||
Zielpfad nach dem Restore:
|
||||
|
||||
```text
|
||||
/mnt/user/backups/restore-lab/vaultwarden/data
|
||||
```
|
||||
|
||||
3. Testcontainer starten
|
||||
|
||||
```bash
|
||||
docker compose -f /mnt/user/services/homelab/ops/restore-tests/vaultwarden-compose.test.yml up -d
|
||||
```
|
||||
|
||||
4. Smoke-Test
|
||||
|
||||
```bash
|
||||
curl -I http://127.0.0.1:18080
|
||||
docker logs restoretest-vaultwarden --tail 50
|
||||
```
|
||||
|
||||
Minimal erfolgreich:
|
||||
|
||||
- HTTP-Antwort kommt
|
||||
- Login-Seite ist erreichbar
|
||||
- Vaultwarden-Daten liegen im Restore-Lab vor
|
||||
|
||||
5. Testcontainer wieder stoppen
|
||||
|
||||
```bash
|
||||
docker compose -f /mnt/user/services/homelab/ops/restore-tests/vaultwarden-compose.test.yml down
|
||||
```
|
||||
|
||||
6. Report schreiben
|
||||
|
||||
Ziel:
|
||||
|
||||
```text
|
||||
/mnt/user/backups/restore-reports/vaultwarden-REPORT_DATE.md
|
||||
```
|
||||
|
||||
7. Testdaten nach erfolgreichem Lauf bereinigen
|
||||
|
||||
```bash
|
||||
rm -rf /mnt/user/backups/restore-lab/vaultwarden/data
|
||||
```
|
||||
|
||||
## Festgelegte Entscheidungen
|
||||
|
||||
- Testdaten werden nach erfolgreichem Lauf geloescht.
|
||||
- `ntfy` wird nicht im ersten echten Lauf eingebunden.
|
||||
- `ntfy` folgt spaeter, wenn der manuelle Basisablauf stabil verifiziert ist.
|
||||
Reference in New Issue
Block a user