hermes update

hermes next level
This commit is contained in:
2026-05-06 20:13:48 +02:00
parent 1dc1c1ef17
commit 84020346bc
2 changed files with 517 additions and 22 deletions
+18 -22
View File
@@ -2,21 +2,23 @@
"""
check_health.py — Homelab Alert Enricher
=========================================

Loads services.json, checks the Docker health of all known dependencies,
reads dump timestamps and prints a structured JSON report.
Hermes reads this report and builds an enriched ntfy message from it.

No external dependencies — Python standard library only.
(The service catalogue was previously kept in services.yaml; services.json
replaces it.)

Usage:
    python3 check_health.py                  # all unhealthy containers
    python3 check_health.py paperless-ngx    # check one specific service
    python3 check_health.py --summary        # overall status as a summary

Path on the host (via Komodo clone):
    /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py

Path on the Hermes VM (via git pull):
    /srv/hermes-workspace/homelab-infra/ops/hermes-agent/scripts/check_health.py

services.json is looked up relative to the script directory:
    ../services.json
"""
import json
@@ -31,10 +33,10 @@ from pathlib import Path
# ---------------------------------------------------------------------------
# Directory that contains this script (resolved to an absolute path).
SCRIPT_DIR = Path(__file__).parent.resolve()
# Service catalogue files are expected one directory above the script.
# NOTE(review): the *_YAML_* constants look like the pre-commit side of this
# change (the loader now reads services.json) — confirm whether they are
# still needed.
SERVICES_YAML_PATH = SCRIPT_DIR.parent / "services.yaml"
SERVICES_JSON_PATH = SCRIPT_DIR.parent / "services.json"
# Fallback in case the repository is checked out under a different path
SERVICES_YAML_FALLBACK = Path("/mnt/user/services/homelab/ops/hermes-agent/services.yaml")
SERVICES_JSON_FALLBACK = Path("/srv/hermes-workspace/homelab-infra/ops/hermes-agent/services.json")
# Dump warning threshold in hours (older than this = warning)
DUMP_WARN_HOURS = 26
@@ -45,23 +47,17 @@ DUMP_WARN_HOURS = 26
# ---------------------------------------------------------------------------
def load_services():
    """Load the service catalogue from services.json.

    The file is looked up at SERVICES_JSON_PATH first; if that path does not
    exist, SERVICES_JSON_FALLBACK is used instead. Parsing relies solely on
    the standard-library ``json`` module, so nothing has to be installed at
    runtime.

    Returns:
        A ``(services_dict, meta_dict)`` tuple taken from the top-level
        "services" and "meta" keys. A missing key yields an empty dict.

    Raises:
        FileNotFoundError: If neither candidate path exists.
        json.JSONDecodeError: Propagated from ``json.load`` when the file is
            not valid JSON.
    """
    path = SERVICES_JSON_PATH if SERVICES_JSON_PATH.exists() else SERVICES_JSON_FALLBACK
    if not path.exists():
        # The message names the fallback path, since that was the last one tried.
        raise FileNotFoundError(
            f"services.json nicht gefunden: {path}\n"
            "Bitte 'git pull' in /srv/hermes-workspace/homelab-infra/ ausfuehren."
        )

    # Explicit encoding keeps parsing independent of the host locale.
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    return data.get("services", {}), data.get("meta", {})
+499
View File
@@ -0,0 +1,499 @@
{
"meta": {
"dump_base": "/mnt/user/backups/borg/dumps/latest",
"appdata_base": "/mnt/user/appdata",
"secrets_path": "/mnt/user/appdata/secrets"
},
"services": {
"traefik": {
"description": "Zentraler Reverse Proxy, TLS, Docker-Label-Routing",
"tier": 1,
"category": "core",
"container_name": "traefik",
"dependencies": [],
"url": "https://traefik.kaleschke.info",
"dump_file": null,
"data_paths": ["/mnt/user/appdata/traefik/dynamic", "/mnt/user/appdata/traefik/letsencrypt"],
"first_check": "Host-Ports 80/443 erreichbar? dynamic/ korrekt auf Host synchronisiert?",
"notes": "dynamic configs werden NICHT automatisch von Komodo deployed — manueller Host-Sync noetig"
},
"adguard": {
"description": "DNS-Server / LAN DNS",
"tier": 1,
"category": "core",
"container_name": "adguard",
"dependencies": ["unbound"],
"url": null,
"dump_file": null,
"data_paths": ["/mnt/user/appdata/adguard/conf", "/mnt/user/appdata/adguard/work"],
"first_check": "Port 53 erreichbar? Unbound healthy? dns_net Konnektivitaet?",
"notes": "Ports 53 und 8082 dokumentierte Host-Port-Ausnahmen"
},
"unbound": {
"description": "Upstream DNS Resolver fuer AdGuard",
"tier": 1,
"category": "core",
"container_name": "unbound",
"dependencies": [],
"url": null,
"dump_file": null,
"data_paths": ["/mnt/user/appdata/unbound/config"],
"first_check": "dns_net Konnektivitaet pruefen; Container-Logs auf Fehler pruefen",
"notes": "rebuildbar; isoliert in dns_net"
},
"tailscale": {
"description": "VPN / Remote-Zugang",
"tier": 1,
"category": "core",
"container_name": "tailscale",
"dependencies": [],
"url": null,
"dump_file": null,
"data_paths": ["/mnt/user/appdata/tailscale"],
"first_check": "Tailscale Status auf Host pruefen; State-Datei fuer Key-Renewal vorhanden?",
"notes": "network_mode: host; NET_ADMIN, NET_RAW, /dev/net/tun — dokumentierte VPN-Ausnahmen"
},
"gitea": {
"description": "Git-Server — operative Quelle der Wahrheit fuer GitOps",
"tier": 1,
"category": "core",
"container_name": "gitea",
"dependencies": ["traefik"],
"url": "https://git.kaleschke.info",
"dump_file": null,
"data_paths": ["/mnt/user/services/gitea/data"],
"first_check": "HTTPS erreichbar? SQLite in /data intakt? SSH-Port 222 erreichbar?",
"notes": "SQLite in /data — kein separater Dump; ohne externen Mirror im DR kritisch"
},
"authelia": {
"description": "ForwardAuth — zentrale Authentifizierung fuer Admin-UIs",
"tier": 1,
"category": "security",
"container_name": "authelia",
"dependencies": ["postgresql17", "traefik"],
"url": "https://auth.kaleschke.info",
"dump_file": "postgresql17-authelia.dump",
"data_paths": ["/mnt/user/appdata/authelia/config"],
"first_check": "PostgreSQL healthy? SMTP via GMX erreichbar? Host-Config aktuell (Repo-Baseline != Host)?",
"notes": "kein Redis-Session-Backend; SMTP-Notifier GMX; Repo-Baseline muss manuell in Host-Config gemerged werden"
},
"vaultwarden": {
"description": "Passwort-Tresor",
"tier": 1,
"category": "security",
"container_name": "vaultwarden",
"dependencies": ["traefik"],
"url": "https://vault.kaleschke.info",
"dump_file": null,
"data_paths": ["/mnt/user/appdata/vaultwarden"],
"first_check": "HTTPS erreichbar? Appdata-Volume intakt?",
"notes": "ADMIN_TOKEN_FILE; keine direkten Host-Ports"
},
"postgresql17": {
"description": "Shared PostgreSQL Cluster",
"tier": 1,
"category": "infra",
"container_name": "postgresql17",
"dependencies": [],
"url": null,
"dump_file": null,
"data_paths": ["/mnt/user/appdata/postgresql17"],
"first_check": "backend_net Konnektivitaet? Disk-Space auf /mnt/user/appdata? pg_isready im Container?",
"notes": "Dumps per Dienst unter dumps/latest; raw DB nicht primaerer Restore-Weg"
},
"komodo-core": {
"description": "GitOps UI / API / Stack-Manager",
"tier": 1,
"category": "ops",
"container_name": "komodo-core",
"dependencies": ["komodo-mongo", "gitea", "traefik"],
"url": "https://komodo.kaleschke.info",
"dump_file": "komodo-mongo.archive.gz",
"data_paths": ["/mnt/user/appdata/komodo/core"],
"first_check": "MongoDB healthy? Gitea erreichbar? komodo_net Konnektivitaet?",
"notes": "keine pauschale Authelia-ForwardAuth; Gitea DNS override konfiguriert"
},
"komodo-mongo": {
"description": "Komodo Datenbank (MongoDB)",
"tier": 1,
"category": "infra",
"container_name": "komodo-mongo",
"dependencies": [],
"url": null,
"dump_file": "komodo-mongo.archive.gz",
"data_paths": ["/mnt/user/appdata/komodo/mongo"],
"first_check": "komodo_net Konnektivitaet? Disk-Space? mongosh ping?",
"notes": "Dump-Integritaet nach Major-Upgrades pruefen"
},
"komodo-periphery": {
"description": "Komodo Host-Agent (Stack-Deployments)",
"tier": 1,
"category": "ops",
"container_name": "komodo-periphery",
"dependencies": ["komodo-core"],
"url": null,
"dump_file": null,
"data_paths": ["/mnt/user/appdata/komodo/periphery"],
"first_check": "Docker-Socket lesbar? /mnt/user/services gemountet? komodo_net Verbindung zu Core?",
"notes": "Docker-Socket-Ausnahme dokumentiert; /mnt/user/services Mount fuer Stack-Workspaces"
},
"redis": {
"description": "Shared Redis Cache",
"tier": 2,
"category": "infra",
"container_name": "redis",
"dependencies": [],
"url": null,
"dump_file": null,
"data_paths": ["/mnt/user/appdata/redis"],
"first_check": "backend_net Konnektivitaet? redis-cli ping erreichbar?",
"notes": "transiente Daten; bewusst nicht Backup-kritisch"
},
"paperless-ngx": {
"description": "Dokumentenmanagement",
"tier": 2,
"category": "app",
"container_name": "paperless-ngx",
"dependencies": ["postgresql17", "redis", "traefik"],
"url": "https://paperless.kaleschke.info",
"dump_file": "postgresql17-paperless.dump",
"data_paths": [
"/mnt/user/appdata/paperless-ngx/data",
"/mnt/user/documents/paperless",
"/mnt/user/documents/scans_inbox"
],
"first_check": "Redis healthy? PostgreSQL healthy? backend_net Konnektivitaet?",
"notes": "DB/Redis Secrets als Stack ENV (keine _FILE Variante)"
},
"paperless-gpt": {
"description": "KI-Ergaenzung fuer Paperless",
"tier": 2,
"category": "app",
"container_name": "paperless-gpt",
"dependencies": ["paperless-ngx", "traefik"],
"url": "https://paperless-gpt.kaleschke.info",
"dump_file": null,
"data_paths": [
"/mnt/user/appdata/paperless-gpt/data",
"/mnt/user/appdata/paperless-gpt/prompts"
],
"first_check": "Paperless API erreichbar? LLM/Ollama erreichbar? API Token gesetzt?",
"notes": "API Token als Stack ENV; abhaengig von laufendem Paperless"
},
"immich_server": {
"description": "Foto-/Video-App",
"tier": 2,
"category": "app",
"container_name": "immich_server",
"dependencies": ["immich_postgres", "immich_redis", "immich_machine_learning", "traefik"],
"url": "https://immich.kaleschke.info",
"dump_file": "immich.dump",
"data_paths": ["/mnt/user/photos/immich", "/mnt/user/photos/family_archive"],
"first_check": "immich_postgres healthy? immich_redis healthy? ML healthy? immich_default Netz?",
"notes": "native App-Auth; externes Fotoarchiv gemountet"
},
"immich_postgres": {
"description": "Immich-Datenbank",
"tier": 2,
"category": "infra",
"container_name": "immich_postgres",
"dependencies": [],
"url": null,
"dump_file": "immich.dump",
"data_paths": ["/mnt/user/appdata/immich_postgres"],
"first_check": "immich_default Netz? Disk-Space? pg_isready?",
"notes": "nie ins frontend_net; immich_default Netz isoliert"
},
"immich_redis": {
"description": "Immich Cache",
"tier": 2,
"category": "infra",
"container_name": "immich_redis",
"dependencies": [],
"url": null,
"dump_file": null,
"data_paths": [],
"first_check": "immich_default Netz? redis-cli ping?",
"notes": "rebuildbar; anonymes Volume — named volume als offenes TODO"
},
"immich_machine_learning": {
"description": "Immich ML (Gesichtserkennung, Suche)",
"tier": 2,
"category": "infra",
"container_name": "immich_machine_learning",
"dependencies": [],
"url": null,
"dump_file": null,
"data_paths": [],
"first_check": "immich_default Netz? model-cache Volume vorhanden?",
"notes": "rebuildbar; intern-only"
},
"mealie": {
"description": "Rezeptverwaltung",
"tier": 2,
"category": "app",
"container_name": "mealie",
"dependencies": ["mealie-postgres", "traefik"],
"url": "https://mealie.kaleschke.info",
"dump_file": "mealie.dump",
"data_paths": ["/mnt/user/appdata/mealie/data"],
"first_check": "mealie-postgres healthy? mealie_internal Netz erreichbar?",
"notes": "App + DB in internem Netz getrennt (mealie_internal)"
},
"mealie-postgres": {
"description": "Mealie-Datenbank",
"tier": 2,
"category": "infra",
"container_name": "mealie-postgres",
"dependencies": [],
"url": null,
"dump_file": "mealie.dump",
"data_paths": ["/mnt/user/appdata/mealie/postgres"],
"first_check": "mealie_internal Netz? Disk-Space?",
"notes": "interne DB; mealie_internal Netz"
},
"mail-archiver": {
"description": "Mail-Archivierung (IMAP)",
"tier": 2,
"category": "app",
"container_name": "mail-archiver",
"dependencies": ["postgresql17", "authelia", "traefik"],
"url": "https://mail.kaleschke.info",
"dump_file": "postgresql17-mailarchiver.dump",
"data_paths": ["/mnt/user/appdata/mailarchiver/data-protection-keys"],
"first_check": "PostgreSQL healthy? Internet-/IMAP-Zugang? Authelia healthy?",
"notes": "Hybrid: frontend_net fuer IMAP/Internet, backend_net fuer DB"
},
"nextcloud": {
"description": "Datei-/Cloud-Dienst",
"tier": 2,
"category": "app",
"container_name": "nextcloud",
"dependencies": ["nextcloud-postgres", "nextcloud-redis", "traefik"],
"url": "https://cloud.kaleschke.info",
"dump_file": null,
"data_paths": [
"/mnt/user/appdata/nextcloud/html",
"/mnt/user/documents/nextcloud-data"
],
"first_check": "nextcloud-postgres healthy? nextcloud-redis healthy? nextcloud_internal Netz?",
"notes": "native App-Auth (kein zentrales ForwardAuth); WebDAV/CardDAV beachten"
},
"nextcloud-postgres": {
"description": "Nextcloud-Datenbank",
"tier": 2,
"category": "infra",
"container_name": "nextcloud-postgres",
"dependencies": [],
"url": null,
"dump_file": null,
"data_paths": ["/mnt/user/appdata/nextcloud/postgres"],
"first_check": "nextcloud_internal Netz? Disk-Space?",
"notes": "interne DB"
},
"nextcloud-redis": {
"description": "Nextcloud Cache / Locking",
"tier": 2,
"category": "infra",
"container_name": "nextcloud-redis",
"dependencies": [],
"url": null,
"dump_file": null,
"data_paths": ["/mnt/user/appdata/nextcloud/redis"],
"first_check": "nextcloud_internal Netz? redis-cli ping?",
"notes": "rebuildbar"
},
"ntfy": {
"description": "Push-Benachrichtigungen (Alert-Backbone)",
"tier": 2,
"category": "app",
"container_name": "ntfy",
"dependencies": ["traefik"],
"url": "https://ntfy.kaleschke.info",
"dump_file": null,
"data_paths": ["/mnt/user/appdata/ntfy"],
"first_check": "HTTPS erreichbar? NTFY_BEHIND_PROXY=true gesetzt? Traefik healthy?",
"notes": "KRITISCH: Bei Ausfall kommen keine anderen Alerts mehr an"
},
"homepage": {
"description": "Start-Dashboard",
"tier": 3,
"category": "ops",
"container_name": "homepage",
"dependencies": ["traefik"],
"url": "https://home.kaleschke.info",
"dump_file": null,
"data_paths": ["/mnt/user/appdata/homepage"],
"first_check": "Traefik erreichbar? Docker-Socket read-only lesbar? API-Tokens gueltig?",
"notes": "Docker socket read-only; viele API Tokens in Config"
},
"uptime-kuma": {
"description": "Monitoring / Uptime Checks",
"tier": 3,
"category": "ops",
"container_name": "UptimeKuma",
"dependencies": ["traefik"],
"url": "https://uptime.kaleschke.info",
"dump_file": null,
"data_paths": ["/mnt/user/appdata/uptime-kuma"],
"first_check": "Datenbank-Volume intakt? Traefik erreichbar?",
"notes": "Monitore nach Restore manuell pruefen"
},
"grafana": {
"description": "Metrik-Dashboard",
"tier": 3,
"category": "ops",
"container_name": "grafana",
"dependencies": ["influxdb3-core", "traefik"],
"url": "https://grafana.kaleschke.info",
"dump_file": null,
"data_paths": ["/mnt/user/appdata/grafana"],
"first_check": "influxdb3-core healthy? Datasource-Token gesetzt? Provisioning-Konfig vorhanden?",
"notes": "laeuft als user 0; Datasource wird provisioniert"
},
"influxdb3-core": {
"description": "Zeitreihen- / Metrikdaten fuer Grafana und Home Assistant",
"tier": 3,
"category": "ops",
"container_name": "influxdb3-core",
"dependencies": [],
"url": null,
"dump_file": null,
"data_paths": [
"/mnt/user/appdata/influxdb3/data",
"/mnt/user/appdata/influxdb3/plugins"
],
"first_check": "LAN-Port 8181 erreichbar? 401 ohne Token = OK (erwartet). Disk-Space?",
"notes": "LAN-only Host-Port 8181; kein frontend_net; laeuft als user 0"
},
"scrutiny": {
"description": "Laufwerks- / SMART-Monitoring",
"tier": 3,
"category": "ops",
"container_name": "scrutiny",
"dependencies": ["traefik"],
"url": "https://scrutiny.kaleschke.info",
"dump_file": null,
"data_paths": [
"/mnt/user/appdata/scrutiny/config",
"/mnt/user/appdata/scrutiny/influxdb"
],
"first_check": "Device-Mounts vorhanden? privileged=true gesetzt? Traefik erreichbar?",
"notes": "privileged: true dokumentierte Ausnahme"
},
"glances": {
"description": "System- / Container-Monitoring",
"tier": 3,
"category": "ops",
"container_name": "glances",
"dependencies": ["traefik"],
"url": "https://glances.kaleschke.info",
"dump_file": null,
"data_paths": [],
"first_check": "Docker-Socket lesbar? rootfs gemountet? Traefik erreichbar?",
"notes": "rebuildbar; Docker-Socket und rootfs Mounts"
},
"borg-ui": {
"description": "Borg Backup- / Restore UI",
"tier": 3,
"category": "ops",
"container_name": "borg-ui",
"dependencies": ["traefik"],
"url": "https://borg.kaleschke.info",
"dump_file": null,
"data_paths": [
"/mnt/user/appdata/borg-ui/data",
"/mnt/user/backups/borg/dumps"
],
"first_check": "Borg-Repo-Credentials vorhanden? Backup-Mounts erreichbar? Traefik healthy?",
"notes": "breite Mounts bewusst dokumentiert; /local/secrets im DR-Scope"
},
"backrest": {
"description": "Backup-Admin-Dienst",
"tier": 3,
"category": "ops",
"container_name": "backrest",
"dependencies": ["traefik"],
"url": "https://backrest.kaleschke.info",
"dump_file": null,
"data_paths": ["/mnt/user/appdata/backrest"],
"first_check": "Repo/SSH-Mounts erreichbar? Traefik healthy?",
"notes": "breite Mounts bewusst dokumentiert"
},
"hermes-gateway": {
"description": "Hermes Agent Gateway / AI Ops Assistant",
"tier": 3,
"category": "ops",
"container_name": "hermes-gateway",
"dependencies": [],
"url": null,
"dump_file": null,
"data_paths": ["/mnt/user/appdata/hermes-agent/data"],
"first_check": "hermes_net:8642/health erreichbar? SSH-Key gemountet? LLM-Provider erreichbar?",
"notes": "kein Docker-Socket; SSH terminal backend; echte .env auf Host-Appdata"
},
"ddns-updater": {
"description": "Cloudflare / DDNS Aktualisierung",
"tier": 3,
"category": "infra",
"container_name": "ddns-updater",
"dependencies": [],
"url": null,
"dump_file": null,
"data_paths": ["/mnt/user/appdata/ddns-updater"],
"first_check": "Internetzugang? Cloudflare API erreichbar? Config vorhanden?",
"notes": "bewusst in frontend_net weil backend_net internal ist"
},
"code-server": {
"description": "Web-Editor / Operations Workspace",
"tier": 3,
"category": "ops",
"container_name": "code-server",
"dependencies": ["traefik"],
"url": "https://code.kaleschke.info",
"dump_file": null,
"data_paths": [
"/mnt/user/appdata/code-server",
"/mnt/user/services/dev"
],
"first_check": "Traefik erreichbar? PASSWORD_FILE lesbar?",
"notes": "PASSWORD_FILE; Workspaces bei Restore beachten"
},
"filebrowser": {
"description": "Datei-Browser fuer Appdata",
"tier": 3,
"category": "ops",
"container_name": "filebrowser",
"dependencies": ["traefik"],
"url": "https://files.kaleschke.info",
"dump_file": null,
"data_paths": ["/mnt/user/appdata/filebrowser"],
"first_check": "Appdata-Mounts erreichbar? Traefik healthy?",
"notes": "breiter /mnt/user/appdata Mount; Einschraenkung langfristig als TODO"
},
"speedtest-tracker": {
"description": "Speedtest-Monitoring",
"tier": 3,
"category": "ops",
"container_name": "speedtest-tracker",
"dependencies": ["traefik"],
"url": "https://speedtest.kaleschke.info",
"dump_file": null,
"data_paths": ["/mnt/user/appdata/speedtest-tracker/config"],
"first_check": "APP_KEY gesetzt? Internetzugang fuer Speedtest vorhanden?",
"notes": "APP_KEY, ADMIN_PASSWORD als Stack ENV"
},
"bentopdf": {
"description": "PDF-Tooling",
"tier": 3,
"category": "app",
"container_name": "bentopdf",
"dependencies": ["traefik"],
"url": "https://pdf.kaleschke.info",
"dump_file": null,
"data_paths": [],
"first_check": "COOP/COEP Middleware gesetzt? Traefik healthy?",
"notes": "rebuildbar; keine kritische Persistenz"
}
}
}