diff --git a/ops/hermes-agent/scripts/check_health.py b/ops/hermes-agent/scripts/check_health.py index 8fdd1cd..d406d6b 100644 --- a/ops/hermes-agent/scripts/check_health.py +++ b/ops/hermes-agent/scripts/check_health.py @@ -2,21 +2,23 @@ """ check_health.py — Homelab Alert Enricher ========================================= -Laedt services.yaml, prueft Docker-Health aller bekannten Abhaengigkeiten, +Laedt services.json, prueft Docker-Health aller bekannten Abhaengigkeiten, liest Dump-Timestamps und gibt einen strukturierten JSON-Report aus. Hermes liest diesen Report und baut daraus eine angereicherte ntfy-Nachricht. +Keine externen Abhaengigkeiten — nur Python-Standardbibliothek. + Verwendung: python3 check_health.py # alle unhealthy Container python3 check_health.py paperless-ngx # gezielt einen Service pruefen python3 check_health.py --summary # Gesamtstatus als Zusammenfassung -Pfad auf Host (via Komodo-Clone): - /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py +Pfad auf der Hermes-VM (via git pull): + /srv/hermes-workspace/homelab-infra/ops/hermes-agent/scripts/check_health.py -services.yaml wird relativ zum Script-Verzeichnis gesucht: - ../services.yaml +services.json wird relativ zum Script-Verzeichnis gesucht: + ../services.json """ import json @@ -31,10 +33,10 @@ from pathlib import Path # --------------------------------------------------------------------------- SCRIPT_DIR = Path(__file__).parent.resolve() -SERVICES_YAML_PATH = SCRIPT_DIR.parent / "services.yaml" +SERVICES_JSON_PATH = SCRIPT_DIR.parent / "services.json" # Fallback falls das Repo unter einem anderen Pfad liegt -SERVICES_YAML_FALLBACK = Path("/mnt/user/services/homelab/ops/hermes-agent/services.yaml") +SERVICES_JSON_FALLBACK = Path("/srv/hermes-workspace/homelab-infra/ops/hermes-agent/services.json") # Dump-Warnschwelle in Stunden (aelter = Warnung) DUMP_WARN_HOURS = 26 @@ -45,23 +47,17 @@ DUMP_WARN_HOURS = 26 # 
def load_services(path=None):
    """Load the service catalogue from services.json.

    Only the standard-library ``json`` module is used; no external
    dependencies are required.

    Args:
        path: Optional explicit location of the catalogue file (``str`` or
            ``Path``). When omitted, ``SERVICES_JSON_PATH`` is used if it
            exists, otherwise ``SERVICES_JSON_FALLBACK``.

    Returns:
        A ``(services, meta)`` tuple taken from the top-level ``"services"``
        and ``"meta"`` keys. A missing or ``null`` key yields an empty dict.

    Raises:
        FileNotFoundError: If the selected file does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
        ValueError: If the top-level JSON value is not an object.
    """
    if path is not None:
        path = Path(path)
    elif SERVICES_JSON_PATH.exists():
        path = SERVICES_JSON_PATH
    else:
        # Fallback in case the repository is checked out under another path.
        path = SERVICES_JSON_FALLBACK

    if not path.exists():
        raise FileNotFoundError(
            f"services.json nicht gefunden: {path}\n"
            f"Bitte 'git pull' in /srv/hermes-workspace/homelab-infra/ ausfuehren."
        )

    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    # json.load happily returns lists, strings or None; fail with a message
    # that names the file instead of an AttributeError on ``.get`` below.
    if not isinstance(data, dict):
        raise ValueError(
            f"services.json hat ein ungueltiges Format "
            f"(Objekt erwartet, {type(data).__name__} gefunden): {path}"
        )

    # ``or {}`` also covers an explicit JSON ``null`` so callers always
    # receive something they can iterate over.
    return data.get("services") or {}, data.get("meta") or {}
dynamic/ korrekt auf Host synchronisiert?", + "notes": "dynamic configs werden NICHT automatisch von Komodo deployed — manueller Host-Sync noetig" + }, + "adguard": { + "description": "DNS-Server / LAN DNS", + "tier": 1, + "category": "core", + "container_name": "adguard", + "dependencies": ["unbound"], + "url": null, + "dump_file": null, + "data_paths": ["/mnt/user/appdata/adguard/conf", "/mnt/user/appdata/adguard/work"], + "first_check": "Port 53 erreichbar? Unbound healthy? dns_net Konnektivitaet?", + "notes": "Ports 53 und 8082 dokumentierte Host-Port-Ausnahmen" + }, + "unbound": { + "description": "Upstream DNS Resolver fuer AdGuard", + "tier": 1, + "category": "core", + "container_name": "unbound", + "dependencies": [], + "url": null, + "dump_file": null, + "data_paths": ["/mnt/user/appdata/unbound/config"], + "first_check": "dns_net Konnektivitaet pruefen; Container-Logs auf Fehler pruefen", + "notes": "rebuildbar; isoliert in dns_net" + }, + "tailscale": { + "description": "VPN / Remote-Zugang", + "tier": 1, + "category": "core", + "container_name": "tailscale", + "dependencies": [], + "url": null, + "dump_file": null, + "data_paths": ["/mnt/user/appdata/tailscale"], + "first_check": "Tailscale Status auf Host pruefen; State-Datei fuer Key-Renewal vorhanden?", + "notes": "network_mode: host; NET_ADMIN, NET_RAW, /dev/net/tun — dokumentierte VPN-Ausnahmen" + }, + "gitea": { + "description": "Git-Server — operative Quelle der Wahrheit fuer GitOps", + "tier": 1, + "category": "core", + "container_name": "gitea", + "dependencies": ["traefik"], + "url": "https://git.kaleschke.info", + "dump_file": null, + "data_paths": ["/mnt/user/services/gitea/data"], + "first_check": "HTTPS erreichbar? SQLite in /data intakt? 
SSH-Port 222 erreichbar?", + "notes": "SQLite in /data — kein separater Dump; ohne externen Mirror im DR kritisch" + }, + "authelia": { + "description": "ForwardAuth — zentrale Authentifizierung fuer Admin-UIs", + "tier": 1, + "category": "security", + "container_name": "authelia", + "dependencies": ["postgresql17", "traefik"], + "url": "https://auth.kaleschke.info", + "dump_file": "postgresql17-authelia.dump", + "data_paths": ["/mnt/user/appdata/authelia/config"], + "first_check": "PostgreSQL healthy? SMTP via GMX erreichbar? Host-Config aktuell (Repo-Baseline != Host)?", + "notes": "kein Redis-Session-Backend; SMTP-Notifier GMX; Repo-Baseline muss manuell in Host-Config gemerged werden" + }, + "vaultwarden": { + "description": "Passwort-Tresor", + "tier": 1, + "category": "security", + "container_name": "vaultwarden", + "dependencies": ["traefik"], + "url": "https://vault.kaleschke.info", + "dump_file": null, + "data_paths": ["/mnt/user/appdata/vaultwarden"], + "first_check": "HTTPS erreichbar? Appdata-Volume intakt?", + "notes": "ADMIN_TOKEN_FILE; keine direkten Host-Ports" + }, + "postgresql17": { + "description": "Shared PostgreSQL Cluster", + "tier": 1, + "category": "infra", + "container_name": "postgresql17", + "dependencies": [], + "url": null, + "dump_file": null, + "data_paths": ["/mnt/user/appdata/postgresql17"], + "first_check": "backend_net Konnektivitaet? Disk-Space auf /mnt/user/appdata? pg_isready im Container?", + "notes": "Dumps per Dienst unter dumps/latest; raw DB nicht primaerer Restore-Weg" + }, + "komodo-core": { + "description": "GitOps UI / API / Stack-Manager", + "tier": 1, + "category": "ops", + "container_name": "komodo-core", + "dependencies": ["komodo-mongo", "gitea", "traefik"], + "url": "https://komodo.kaleschke.info", + "dump_file": "komodo-mongo.archive.gz", + "data_paths": ["/mnt/user/appdata/komodo/core"], + "first_check": "MongoDB healthy? Gitea erreichbar? 
komodo_net Konnektivitaet?", + "notes": "keine pauschale Authelia-ForwardAuth; Gitea DNS override konfiguriert" + }, + "komodo-mongo": { + "description": "Komodo Datenbank (MongoDB)", + "tier": 1, + "category": "infra", + "container_name": "komodo-mongo", + "dependencies": [], + "url": null, + "dump_file": "komodo-mongo.archive.gz", + "data_paths": ["/mnt/user/appdata/komodo/mongo"], + "first_check": "komodo_net Konnektivitaet? Disk-Space? mongosh ping?", + "notes": "Dump-Integritaet nach Major-Upgrades pruefen" + }, + "komodo-periphery": { + "description": "Komodo Host-Agent (Stack-Deployments)", + "tier": 1, + "category": "ops", + "container_name": "komodo-periphery", + "dependencies": ["komodo-core"], + "url": null, + "dump_file": null, + "data_paths": ["/mnt/user/appdata/komodo/periphery"], + "first_check": "Docker-Socket lesbar? /mnt/user/services gemountet? komodo_net Verbindung zu Core?", + "notes": "Docker-Socket-Ausnahme dokumentiert; /mnt/user/services Mount fuer Stack-Workspaces" + }, + "redis": { + "description": "Shared Redis Cache", + "tier": 2, + "category": "infra", + "container_name": "redis", + "dependencies": [], + "url": null, + "dump_file": null, + "data_paths": ["/mnt/user/appdata/redis"], + "first_check": "backend_net Konnektivitaet? redis-cli ping erreichbar?", + "notes": "transiente Daten; bewusst nicht Backup-kritisch" + }, + "paperless-ngx": { + "description": "Dokumentenmanagement", + "tier": 2, + "category": "app", + "container_name": "paperless-ngx", + "dependencies": ["postgresql17", "redis", "traefik"], + "url": "https://paperless.kaleschke.info", + "dump_file": "postgresql17-paperless.dump", + "data_paths": [ + "/mnt/user/appdata/paperless-ngx/data", + "/mnt/user/documents/paperless", + "/mnt/user/documents/scans_inbox" + ], + "first_check": "Redis healthy? PostgreSQL healthy? 
backend_net Konnektivitaet?", + "notes": "DB/Redis Secrets als Stack ENV (keine _FILE Variante)" + }, + "paperless-gpt": { + "description": "KI-Ergaenzung fuer Paperless", + "tier": 2, + "category": "app", + "container_name": "paperless-gpt", + "dependencies": ["paperless-ngx", "traefik"], + "url": "https://paperless-gpt.kaleschke.info", + "dump_file": null, + "data_paths": [ + "/mnt/user/appdata/paperless-gpt/data", + "/mnt/user/appdata/paperless-gpt/prompts" + ], + "first_check": "Paperless API erreichbar? LLM/Ollama erreichbar? API Token gesetzt?", + "notes": "API Token als Stack ENV; abhaengig von laufendem Paperless" + }, + "immich_server": { + "description": "Foto-/Video-App", + "tier": 2, + "category": "app", + "container_name": "immich_server", + "dependencies": ["immich_postgres", "immich_redis", "immich_machine_learning", "traefik"], + "url": "https://immich.kaleschke.info", + "dump_file": "immich.dump", + "data_paths": ["/mnt/user/photos/immich", "/mnt/user/photos/family_archive"], + "first_check": "immich_postgres healthy? immich_redis healthy? ML healthy? immich_default Netz?", + "notes": "native App-Auth; externes Fotoarchiv gemountet" + }, + "immich_postgres": { + "description": "Immich-Datenbank", + "tier": 2, + "category": "infra", + "container_name": "immich_postgres", + "dependencies": [], + "url": null, + "dump_file": "immich.dump", + "data_paths": ["/mnt/user/appdata/immich_postgres"], + "first_check": "immich_default Netz? Disk-Space? pg_isready?", + "notes": "nie ins frontend_net; immich_default Netz isoliert" + }, + "immich_redis": { + "description": "Immich Cache", + "tier": 2, + "category": "infra", + "container_name": "immich_redis", + "dependencies": [], + "url": null, + "dump_file": null, + "data_paths": [], + "first_check": "immich_default Netz? 
redis-cli ping?", + "notes": "rebuildbar; anonymes Volume — named volume als offenes TODO" + }, + "immich_machine_learning": { + "description": "Immich ML (Gesichtserkennung, Suche)", + "tier": 2, + "category": "infra", + "container_name": "immich_machine_learning", + "dependencies": [], + "url": null, + "dump_file": null, + "data_paths": [], + "first_check": "immich_default Netz? model-cache Volume vorhanden?", + "notes": "rebuildbar; intern-only" + }, + "mealie": { + "description": "Rezeptverwaltung", + "tier": 2, + "category": "app", + "container_name": "mealie", + "dependencies": ["mealie-postgres", "traefik"], + "url": "https://mealie.kaleschke.info", + "dump_file": "mealie.dump", + "data_paths": ["/mnt/user/appdata/mealie/data"], + "first_check": "mealie-postgres healthy? mealie_internal Netz erreichbar?", + "notes": "App + DB in internem Netz getrennt (mealie_internal)" + }, + "mealie-postgres": { + "description": "Mealie-Datenbank", + "tier": 2, + "category": "infra", + "container_name": "mealie-postgres", + "dependencies": [], + "url": null, + "dump_file": "mealie.dump", + "data_paths": ["/mnt/user/appdata/mealie/postgres"], + "first_check": "mealie_internal Netz? Disk-Space?", + "notes": "interne DB; mealie_internal Netz" + }, + "mail-archiver": { + "description": "Mail-Archivierung (IMAP)", + "tier": 2, + "category": "app", + "container_name": "mail-archiver", + "dependencies": ["postgresql17", "authelia", "traefik"], + "url": "https://mail.kaleschke.info", + "dump_file": "postgresql17-mailarchiver.dump", + "data_paths": ["/mnt/user/appdata/mailarchiver/data-protection-keys"], + "first_check": "PostgreSQL healthy? Internet-/IMAP-Zugang? 
Authelia healthy?", + "notes": "Hybrid: frontend_net fuer IMAP/Internet, backend_net fuer DB" + }, + "nextcloud": { + "description": "Datei-/Cloud-Dienst", + "tier": 2, + "category": "app", + "container_name": "nextcloud", + "dependencies": ["nextcloud-postgres", "nextcloud-redis", "traefik"], + "url": "https://cloud.kaleschke.info", + "dump_file": null, + "data_paths": [ + "/mnt/user/appdata/nextcloud/html", + "/mnt/user/documents/nextcloud-data" + ], + "first_check": "nextcloud-postgres healthy? nextcloud-redis healthy? nextcloud_internal Netz?", + "notes": "native App-Auth (kein zentrales ForwardAuth); WebDAV/CardDAV beachten" + }, + "nextcloud-postgres": { + "description": "Nextcloud-Datenbank", + "tier": 2, + "category": "infra", + "container_name": "nextcloud-postgres", + "dependencies": [], + "url": null, + "dump_file": null, + "data_paths": ["/mnt/user/appdata/nextcloud/postgres"], + "first_check": "nextcloud_internal Netz? Disk-Space?", + "notes": "interne DB" + }, + "nextcloud-redis": { + "description": "Nextcloud Cache / Locking", + "tier": 2, + "category": "infra", + "container_name": "nextcloud-redis", + "dependencies": [], + "url": null, + "dump_file": null, + "data_paths": ["/mnt/user/appdata/nextcloud/redis"], + "first_check": "nextcloud_internal Netz? redis-cli ping?", + "notes": "rebuildbar" + }, + "ntfy": { + "description": "Push-Benachrichtigungen (Alert-Backbone)", + "tier": 2, + "category": "app", + "container_name": "ntfy", + "dependencies": ["traefik"], + "url": "https://ntfy.kaleschke.info", + "dump_file": null, + "data_paths": ["/mnt/user/appdata/ntfy"], + "first_check": "HTTPS erreichbar? NTFY_BEHIND_PROXY=true gesetzt? 
Traefik healthy?", + "notes": "KRITISCH: Ausfall bedeutet keine anderen Alerts ankommen" + }, + "homepage": { + "description": "Start-Dashboard", + "tier": 3, + "category": "ops", + "container_name": "homepage", + "dependencies": ["traefik"], + "url": "https://home.kaleschke.info", + "dump_file": null, + "data_paths": ["/mnt/user/appdata/homepage"], + "first_check": "Traefik erreichbar? Docker-Socket read-only lesbar? API-Tokens gueltig?", + "notes": "Docker socket read-only; viele API Tokens in Config" + }, + "uptime-kuma": { + "description": "Monitoring / Uptime Checks", + "tier": 3, + "category": "ops", + "container_name": "UptimeKuma", + "dependencies": ["traefik"], + "url": "https://uptime.kaleschke.info", + "dump_file": null, + "data_paths": ["/mnt/user/appdata/uptime-kuma"], + "first_check": "Datenbank-Volume intakt? Traefik erreichbar?", + "notes": "Monitore nach Restore manuell pruefen" + }, + "grafana": { + "description": "Metrik-Dashboard", + "tier": 3, + "category": "ops", + "container_name": "grafana", + "dependencies": ["influxdb3-core", "traefik"], + "url": "https://grafana.kaleschke.info", + "dump_file": null, + "data_paths": ["/mnt/user/appdata/grafana"], + "first_check": "influxdb3-core healthy? Datasource-Token gesetzt? Provisioning-Konfig vorhanden?", + "notes": "laeuft als user 0; Datasource wird provisioniert" + }, + "influxdb3-core": { + "description": "Zeitreihen- / Metrikdaten fuer Grafana und Home Assistant", + "tier": 3, + "category": "ops", + "container_name": "influxdb3-core", + "dependencies": [], + "url": null, + "dump_file": null, + "data_paths": [ + "/mnt/user/appdata/influxdb3/data", + "/mnt/user/appdata/influxdb3/plugins" + ], + "first_check": "LAN-Port 8181 erreichbar? 401 ohne Token = OK (erwartet). 
Disk-Space?", + "notes": "LAN-only Host-Port 8181; kein frontend_net; laeuft als user 0" + }, + "scrutiny": { + "description": "Laufwerks- / SMART-Monitoring", + "tier": 3, + "category": "ops", + "container_name": "scrutiny", + "dependencies": ["traefik"], + "url": "https://scrutiny.kaleschke.info", + "dump_file": null, + "data_paths": [ + "/mnt/user/appdata/scrutiny/config", + "/mnt/user/appdata/scrutiny/influxdb" + ], + "first_check": "Device-Mounts vorhanden? privileged=true gesetzt? Traefik erreichbar?", + "notes": "privileged: true dokumentierte Ausnahme" + }, + "glances": { + "description": "System- / Container-Monitoring", + "tier": 3, + "category": "ops", + "container_name": "glances", + "dependencies": ["traefik"], + "url": "https://glances.kaleschke.info", + "dump_file": null, + "data_paths": [], + "first_check": "Docker-Socket lesbar? rootfs gemountet? Traefik erreichbar?", + "notes": "rebuildbar; Docker-Socket und rootfs Mounts" + }, + "borg-ui": { + "description": "Borg Backup- / Restore UI", + "tier": 3, + "category": "ops", + "container_name": "borg-ui", + "dependencies": ["traefik"], + "url": "https://borg.kaleschke.info", + "dump_file": null, + "data_paths": [ + "/mnt/user/appdata/borg-ui/data", + "/mnt/user/backups/borg/dumps" + ], + "first_check": "Borg-Repo-Credentials vorhanden? Backup-Mounts erreichbar? Traefik healthy?", + "notes": "breite Mounts bewusst dokumentiert; /local/secrets im DR-Scope" + }, + "backrest": { + "description": "Backup-Admin-Dienst", + "tier": 3, + "category": "ops", + "container_name": "backrest", + "dependencies": ["traefik"], + "url": "https://backrest.kaleschke.info", + "dump_file": null, + "data_paths": ["/mnt/user/appdata/backrest"], + "first_check": "Repo/SSH-Mounts erreichbar? 
Traefik healthy?", + "notes": "breite Mounts bewusst dokumentiert" + }, + "hermes-gateway": { + "description": "Hermes Agent Gateway / AI Ops Assistant", + "tier": 3, + "category": "ops", + "container_name": "hermes-gateway", + "dependencies": [], + "url": null, + "dump_file": null, + "data_paths": ["/mnt/user/appdata/hermes-agent/data"], + "first_check": "hermes_net:8642/health erreichbar? SSH-Key gemountet? LLM-Provider erreichbar?", + "notes": "kein Docker-Socket; SSH terminal backend; echte .env auf Host-Appdata" + }, + "ddns-updater": { + "description": "Cloudflare / DDNS Aktualisierung", + "tier": 3, + "category": "infra", + "container_name": "ddns-updater", + "dependencies": [], + "url": null, + "dump_file": null, + "data_paths": ["/mnt/user/appdata/ddns-updater"], + "first_check": "Internetzugang? Cloudflare API erreichbar? Config vorhanden?", + "notes": "bewusst in frontend_net weil backend_net internal ist" + }, + "code-server": { + "description": "Web-Editor / Operations Workspace", + "tier": 3, + "category": "ops", + "container_name": "code-server", + "dependencies": ["traefik"], + "url": "https://code.kaleschke.info", + "dump_file": null, + "data_paths": [ + "/mnt/user/appdata/code-server", + "/mnt/user/services/dev" + ], + "first_check": "Traefik erreichbar? PASSWORD_FILE lesbar?", + "notes": "PASSWORD_FILE; Workspaces bei Restore beachten" + }, + "filebrowser": { + "description": "Datei-Browser fuer Appdata", + "tier": 3, + "category": "ops", + "container_name": "filebrowser", + "dependencies": ["traefik"], + "url": "https://files.kaleschke.info", + "dump_file": null, + "data_paths": ["/mnt/user/appdata/filebrowser"], + "first_check": "Appdata-Mounts erreichbar? 
Traefik healthy?", + "notes": "breiter /mnt/user/appdata Mount; Einschraenkung langfristig als TODO" + }, + "speedtest-tracker": { + "description": "Speedtest-Monitoring", + "tier": 3, + "category": "ops", + "container_name": "speedtest-tracker", + "dependencies": ["traefik"], + "url": "https://speedtest.kaleschke.info", + "dump_file": null, + "data_paths": ["/mnt/user/appdata/speedtest-tracker/config"], + "first_check": "APP_KEY gesetzt? Internetzugang fuer Speedtest vorhanden?", + "notes": "APP_KEY, ADMIN_PASSWORD als Stack ENV" + }, + "bentopdf": { + "description": "PDF-Tooling", + "tier": 3, + "category": "app", + "container_name": "bentopdf", + "dependencies": ["traefik"], + "url": "https://pdf.kaleschke.info", + "dump_file": null, + "data_paths": [], + "first_check": "COOP/COEP Middleware gesetzt? Traefik healthy?", + "notes": "rebuildbar; keine kritische Persistenz" + } + } +}