hermes update

2026-05-06 19:13:52 +02:00
parent bdef0afcb9
commit 0aa8138bdd
4 changed files with 1251 additions and 0 deletions
@@ -0,0 +1,312 @@
+#!/usr/bin/env python3
+"""
+check_health.py — Homelab Alert Enricher
+=========================================
+Laedt services.yaml, prueft Docker-Health aller bekannten Abhaengigkeiten,
+liest Dump-Timestamps und gibt einen strukturierten JSON-Report aus.
+
+Hermes liest diesen Report und baut daraus eine angereicherte ntfy-Nachricht.
+
+Verwendung:
+  python3 check_health.py                  # alle unhealthy Container
+  python3 check_health.py paperless-ngx    # gezielt einen Service pruefen
+  python3 check_health.py --summary        # Gesamtstatus als Zusammenfassung
+
+Pfad auf Host (via Komodo-Clone):
+  /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py
+
+services.yaml wird relativ zum Script-Verzeichnis gesucht:
+  ../services.yaml
+"""
+
+import json
+import os
+import subprocess
+import sys
+from datetime import datetime
+from pathlib import Path
+
+# ---------------------------------------------------------------------------
+# Konfiguration
+# ---------------------------------------------------------------------------
+
+# Directory containing this script; services.yaml is looked up one level above it.
+SCRIPT_DIR = Path(__file__).parent.resolve()
+SERVICES_YAML_PATH = SCRIPT_DIR.parent / "services.yaml"
+
+# Fallback location, used when the repository lives under a different path
+# and SERVICES_YAML_PATH therefore does not exist.
+SERVICES_YAML_FALLBACK = Path("/mnt/user/services/homelab/ops/hermes-agent/services.yaml")
+
+# Dump warning threshold in hours (a dump older than this triggers a warning).
+DUMP_WARN_HOURS = 26
+
+
+# ---------------------------------------------------------------------------
+# Hilfsfunktionen
+# ---------------------------------------------------------------------------
+
+def load_services():
+    """Laedt services.yaml. Gibt (services_dict, meta_dict) zurueck."""
+    try:
+        import yaml
+    except ImportError:
+        # PyYAML nicht installiert — minimaler Fallback ueber pip
+        subprocess.run(
+            [sys.executable, "-m", "pip", "install", "pyyaml", "-q"],
+            check=True
+        )
+        import yaml
+
+    path = SERVICES_YAML_PATH if SERVICES_YAML_PATH.exists() else SERVICES_YAML_FALLBACK
+    if not path.exists():
+        raise FileNotFoundError(f"services.yaml nicht gefunden: {path}")
+
+    with open(path) as f:
+        data = yaml.safe_load(f)
+
+    return data.get("services", {}), data.get("meta", {})
+
+
+def docker_inspect(container_name: str) -> dict:
+    """
+    Gibt {'status': str, 'health': str} zurueck.
+    status:  running | exited | restarting | dead | not_found | error
+    health:  healthy | unhealthy | starting | none | unknown
+    """
+    try:
+        result = subprocess.run(
+            [
+                "docker", "inspect",
+                "--format",
+                "{{.State.Status}}|||{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}",
+                container_name,
+            ],
+            capture_output=True,
+            text=True,
+            timeout=10,
+        )
+        if result.returncode != 0:
+            return {"status": "not_found", "health": "unknown"}
+
+        parts = result.stdout.strip().split("|||")
+        return {
+            "status": parts[0].strip() if parts else "unknown",
+            "health": parts[1].strip() if len(parts) > 1 else "none",
+        }
+    except Exception as e:
+        return {"status": "error", "health": str(e)}
+
+
+def is_healthy(inspect_result: dict) -> bool:
+    status = inspect_result.get("status", "")
+    health = inspect_result.get("health", "")
+    if status != "running":
+        return False
+    if health in ("unhealthy",):
+        return False
+    return True
+
+
+def get_unhealthy_containers() -> list[str]:
+    """Gibt Liste aller Container zurueck die unhealthy oder nicht running sind."""
+    try:
+        # unhealthy per healthcheck
+        r1 = subprocess.run(
+            ["docker", "ps", "--filter", "health=unhealthy", "--format", "{{.Names}}"],
+            capture_output=True, text=True, timeout=10,
+        )
+        # exited/dead Container die eigentlich laufen sollten
+        r2 = subprocess.run(
+            ["docker", "ps", "--filter", "status=exited", "--format", "{{.Names}}"],
+            capture_output=True, text=True, timeout=10,
+        )
+        names = set()
+        for raw in (r1.stdout, r2.stdout):
+            for name in raw.strip().split("\n"):
+                name = name.strip()
+                if name:
+                    names.add(name)
+        return sorted(names)
+    except Exception:
+        return []
+
+
+def get_dump_info(dump_file: str | None, dump_base: str) -> dict | None:
+    """Gibt Alter und Groesse des Dump-Files zurueck (oder None wenn nicht vorhanden)."""
+    if not dump_file:
+        return None
+
+    path = Path(dump_base) / dump_file
+    if not path.exists():
+        return {"file": dump_file, "exists": False, "age_hours": None, "size_mb": None}
+
+    stat = path.stat()
+    age_hours = round((datetime.now().timestamp() - stat.st_mtime) / 3600, 1)
+    size_mb = round(stat.st_size / 1_048_576, 1)
+
+    return {
+        "file": dump_file,
+        "exists": True,
+        "age_hours": age_hours,
+        "size_mb": size_mb,
+        "warn": age_hours > DUMP_WARN_HOURS,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Report-Generierung
+# ---------------------------------------------------------------------------
+
+def build_service_report(service_key: str, service: dict, all_services: dict, meta: dict) -> dict:
+    """Erstellt einen vollstaendigen Report fuer einen einzelnen Service."""
+    dump_base = meta.get("dump_base", "/mnt/user/backups/borg/dumps/latest")
+
+    # Eigener Container-Status
+    own_inspect = docker_inspect(service["container_name"])
+    own_healthy = is_healthy(own_inspect)
+
+    # Abhaengigkeits-Check
+    dep_results = {}
+    for dep_key in service.get("dependencies", []):
+        dep = all_services.get(dep_key)
+        if not dep:
+            dep_results[dep_key] = {"status": "unknown_service", "health": "unknown", "healthy": False}
+            continue
+        insp = docker_inspect(dep["container_name"])
+        dep_results[dep_key] = {
+            **insp,
+            "healthy": is_healthy(insp),
+            "tier": dep.get("tier"),
+            "container_name": dep["container_name"],
+        }
+
+    unhealthy_deps = [k for k, v in dep_results.items() if not v["healthy"]]
+
+    # Dump-Info
+    dump_info = get_dump_info(service.get("dump_file"), dump_base)
+
+    return {
+        "service": service_key,
+        "description": service.get("description", ""),
+        "tier": service.get("tier"),
+        "url": service.get("url"),
+        "container": {
+            "name": service["container_name"],
+            **own_inspect,
+            "healthy": own_healthy,
+        },
+        "dependencies": dep_results,
+        "unhealthy_deps": unhealthy_deps,
+        "dump": dump_info,
+        "first_check": service.get("first_check", ""),
+        "notes": service.get("notes", ""),
+        "timestamp": datetime.now().isoformat(),
+    }
+
+
+def build_summary_report(all_services: dict, meta: dict) -> dict:
+    """Prueft alle Tier-1 und Tier-2 Dienste und gibt einen Gesamtstatus zurueck."""
+    results = {}
+    issues = []
+
+    for key, svc in all_services.items():
+        tier = svc.get("tier", 3)
+        if tier > 2:
+            continue  # Tier-3 im Summary ueberspringen
+
+        insp = docker_inspect(svc["container_name"])
+        healthy = is_healthy(insp)
+        results[key] = {
+            "tier": tier,
+            "healthy": healthy,
+            "status": insp["status"],
+            "health": insp["health"],
+        }
+        if not healthy:
+            issues.append({"service": key, "tier": tier, **insp})
+
+    # Dump-Checks fuer alle Dienste mit dump_file
+    dump_base = meta.get("dump_base", "/mnt/user/backups/borg/dumps/latest")
+    stale_dumps = []
+    for key, svc in all_services.items():
+        info = get_dump_info(svc.get("dump_file"), dump_base)
+        if info and info.get("warn"):
+            stale_dumps.append({
+                "service": key,
+                "file": info["file"],
+                "age_hours": info["age_hours"],
+            })
+
+    return {
+        "mode": "summary",
+        "timestamp": datetime.now().isoformat(),
+        "services_checked": len(results),
+        "issues": issues,
+        "stale_dumps": stale_dumps,
+        "overall_healthy": len(issues) == 0 and len(stale_dumps) == 0,
+    }
+
+
+# ---------------------------------------------------------------------------
+# Einstiegspunkt
+# ---------------------------------------------------------------------------
+
+def main():
+    args = sys.argv[1:]
+    all_services, meta = load_services()
+
+    if "--summary" in args:
+        report = build_summary_report(all_services, meta)
+        print(json.dumps(report, indent=2, ensure_ascii=False))
+        return
+
+    # Expliziter Service-Key als Argument
+    if args and not args[0].startswith("--"):
+        service_key = args[0]
+        service = all_services.get(service_key)
+        if not service:
+            print(json.dumps({"error": f"Service '{service_key}' nicht in services.yaml gefunden."}))
+            sys.exit(1)
+        report = build_service_report(service_key, service, all_services, meta)
+        print(json.dumps(report, indent=2, ensure_ascii=False))
+        return
+
+    # Kein Argument: alle unhealthy Container automatisch finden
+    unhealthy_names = get_unhealthy_containers()
+
+    if not unhealthy_names:
+        print(json.dumps({"status": "all_healthy", "timestamp": datetime.now().isoformat()}))
+        return
+
+    reports = []
+    for container_name in unhealthy_names:
+        # Container-Name auf Service-Key mappen
+        service_key = None
+        service = None
+        for key, svc in all_services.items():
+            if svc["container_name"] == container_name:
+                service_key = key
+                service = svc
+                break
+
+        if not service:
+            reports.append({
+                "service": container_name,
+                "description": "Unbekannter Container (nicht in services.yaml)",
+                "tier": None,
+                "container": {"name": container_name, "status": "unhealthy", "health": "unknown", "healthy": False},
+                "dependencies": {},
+                "unhealthy_deps": [],
+                "dump": None,
+                "first_check": "Container nicht in services.yaml — manuell pruefen",
+                "notes": "services.yaml aktualisieren wenn dieser Container produktiv ist",
+                "timestamp": datetime.now().isoformat(),
+            })
+            continue
+
+        reports.append(build_service_report(service_key, service, all_services, meta))
+
+    print(json.dumps(reports, indent=2, ensure_ascii=False))
+
+
+if __name__ == "__main__":
+    main()
@@ -0,0 +1,592 @@
+# services.yaml — Maschinenlesbare Wissensbasis fuer Hermes Alert Enrichment
+#
+# Abgeleitet aus docs/SERVICE_CATALOG.md
+# Stand: 2026-05-06
+#
+# Zweck: Hermes laedt diese Datei beim Alert-Anreichern, um Abhaengigkeiten,
+#        Dump-Zeitstempel und den ersten Diagnoseschritt nachzuschlagen.
+#
+# Felder:
+#   description    - Kurzbeschreibung des Dienstes
+#   tier           - Kritikalitaet: 1=Control Plane, 2=User Apps, 3=Ops/Tools
+#   category       - core | security | infra | app | ops
+#   container_name - exakter Docker-Containername (fuer docker inspect)
+#   dependencies   - Liste direkter Laufzeit-Abhaengigkeiten (andere Service-Keys)
+#   url            - oeffentliche URL (null = intern/LAN only)
+#   dump_file      - Dateiname in /mnt/user/backups/borg/dumps/latest/ (null = kein Dump)
+#   data_paths     - kritische Datenpfade auf dem Host
+#   first_check    - erster Diagnoseschritt bei Ausfall (Freitext fuer Hermes)
+#   notes          - betriebliche Hinweise und dokumentierte Ausnahmen
+
+meta:
+  dump_base: /mnt/user/backups/borg/dumps/latest
+  appdata_base: /mnt/user/appdata
+  secrets_path: /mnt/user/appdata/secrets
+
+# ---------------------------------------------------------------------------
+# TIER 1 — Control Plane (Ausfall blockiert alles darunter)
+# ---------------------------------------------------------------------------
+
+services:
+
+  traefik:
+    description: Zentraler Reverse Proxy, TLS, Docker-Label-Routing
+    tier: 1
+    category: core
+    container_name: traefik
+    dependencies: []
+    url: https://traefik.kaleschke.info
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/traefik/dynamic
+      - /mnt/user/appdata/traefik/letsencrypt
+    first_check: "Host-Ports 80/443 erreichbar? dynamic/ korrekt auf Host synchronisiert?"
+    notes: "dynamic configs werden NICHT automatisch von Komodo deployed — manueller Host-Sync noetig"
+
+  adguard:
+    description: DNS-Server / LAN DNS
+    tier: 1
+    category: core
+    container_name: adguard
+    dependencies:
+      - unbound
+    url: null
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/adguard/conf
+      - /mnt/user/appdata/adguard/work
+    first_check: "Port 53 erreichbar? Unbound healthy? dns_net Konnektivitaet?"
+    notes: "Ports 53 und 8082 dokumentierte Host-Port-Ausnahmen"
+
+  unbound:
+    description: Upstream DNS Resolver fuer AdGuard
+    tier: 1
+    category: core
+    container_name: unbound
+    dependencies: []
+    url: null
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/unbound/config
+    first_check: "dns_net Konnektivitaet pruefen; Container-Logs auf Fehler pruefen"
+    notes: "rebuildbar; isoliert in dns_net"
+
+  tailscale:
+    description: VPN / Remote-Zugang
+    tier: 1
+    category: core
+    container_name: tailscale
+    dependencies: []
+    url: null
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/tailscale
+    first_check: "Tailscale Status auf Host pruefen; State-Datei fuer Key-Renewal vorhanden?"
+    notes: "network_mode: host; NET_ADMIN, NET_RAW, /dev/net/tun — dokumentierte VPN-Ausnahmen"
+
+  gitea:
+    description: Git-Server — operative Quelle der Wahrheit fuer GitOps
+    tier: 1
+    category: core
+    container_name: gitea
+    dependencies:
+      - traefik
+    url: https://git.kaleschke.info
+    dump_file: null
+    data_paths:
+      - /mnt/user/services/gitea/data
+    first_check: "HTTPS erreichbar? SQLite in /data intakt? SSH-Port 222 erreichbar?"
+    notes: "SQLite in /data — kein separater Dump; ohne externen Mirror im DR kritisch"
+
+  authelia:
+    description: ForwardAuth — zentrale Authentifizierung fuer Admin-UIs
+    tier: 1
+    category: security
+    container_name: authelia
+    dependencies:
+      - postgresql17
+      - traefik
+    url: https://auth.kaleschke.info
+    dump_file: postgresql17-authelia.dump
+    data_paths:
+      - /mnt/user/appdata/authelia/config
+    first_check: "PostgreSQL healthy? SMTP via GMX erreichbar? Host-Config aktuell (Repo-Baseline != Host)?"
+    notes: "kein Redis-Session-Backend; SMTP-Notifier GMX; Repo-Baseline muss manuell in Host-Config gemerged werden"
+
+  vaultwarden:
+    description: Passwort-Tresor
+    tier: 1
+    category: security
+    container_name: vaultwarden
+    dependencies:
+      - traefik
+    url: https://vault.kaleschke.info
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/vaultwarden
+    first_check: "HTTPS erreichbar? Appdata-Volume intakt?"
+    notes: "ADMIN_TOKEN_FILE; keine direkten Host-Ports"
+
+  postgresql17:
+    description: Shared PostgreSQL Cluster (Authelia, Paperless, Mail-Archiver, Mealie, Komodo indirekt)
+    tier: 1
+    category: infra
+    container_name: postgresql17
+    dependencies: []
+    url: null
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/postgresql17
+    first_check: "backend_net Konnektivitaet? Disk-Space auf /mnt/user/appdata? pg_isready im Container?"
+    notes: "Dumps per Dienst unter dumps/latest; raw DB nicht primaerer Restore-Weg"
+
+  komodo-core:
+    description: GitOps UI / API / Stack-Manager
+    tier: 1
+    category: ops
+    container_name: komodo-core
+    dependencies:
+      - komodo-mongo
+      - gitea
+      - traefik
+    url: https://komodo.kaleschke.info
+    dump_file: komodo-mongo.archive.gz
+    data_paths:
+      - /mnt/user/appdata/komodo/core
+    first_check: "MongoDB healthy? Gitea erreichbar? komodo_net Konnektivitaet?"
+    notes: "keine pauschale Authelia-ForwardAuth; Gitea DNS override konfiguriert"
+
+  komodo-mongo:
+    description: Komodo Datenbank (MongoDB)
+    tier: 1
+    category: infra
+    container_name: komodo-mongo
+    dependencies: []
+    url: null
+    dump_file: komodo-mongo.archive.gz
+    data_paths:
+      - /mnt/user/appdata/komodo/mongo
+    first_check: "komodo_net Konnektivitaet? Disk-Space? mongosh ping?"
+    notes: "Dump-Integritaet nach Major-Upgrades pruefen"
+
+  komodo-periphery:
+    description: Komodo Host-Agent (Stack-Deployments)
+    tier: 1
+    category: ops
+    container_name: komodo-periphery
+    dependencies:
+      - komodo-core
+    url: null
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/komodo/periphery
+    first_check: "Docker-Socket lesbar? /mnt/user/services gemountet? komodo_net Verbindung zu Core?"
+    notes: "Docker-Socket-Ausnahme dokumentiert; /mnt/user/services Mount fuer Stack-Workspaces"
+
+# ---------------------------------------------------------------------------
+# TIER 2 — User Apps
+# ---------------------------------------------------------------------------
+
+  redis:
+    description: Shared Redis Cache (Paperless, weitere)
+    tier: 2
+    category: infra
+    container_name: redis
+    dependencies: []
+    url: null
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/redis
+    first_check: "backend_net Konnektivitaet? redis-cli ping erreichbar?"
+    notes: "transiente Daten; bewusst nicht Backup-kritisch"
+
+  paperless-ngx:
+    description: Dokumentenmanagement
+    tier: 2
+    category: app
+    container_name: paperless-ngx
+    dependencies:
+      - postgresql17
+      - redis
+      - traefik
+    url: https://paperless.kaleschke.info
+    dump_file: postgresql17-paperless.dump
+    data_paths:
+      - /mnt/user/appdata/paperless-ngx/data
+      - /mnt/user/documents/paperless
+      - /mnt/user/documents/scans_inbox
+    first_check: "Redis healthy? PostgreSQL healthy? backend_net Konnektivitaet?"
+    notes: "DB/Redis Secrets als Stack ENV (keine _FILE Variante)"
+
+  paperless-gpt:
+    description: KI-Ergaenzung fuer Paperless (OCR/Tagging via LLM)
+    tier: 2
+    category: app
+    container_name: paperless-gpt
+    dependencies:
+      - paperless-ngx
+      - traefik
+    url: https://paperless-gpt.kaleschke.info
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/paperless-gpt/data
+      - /mnt/user/appdata/paperless-gpt/prompts
+    first_check: "Paperless API erreichbar? LLM/Ollama erreichbar? API Token gesetzt?"
+    notes: "API Token als Stack ENV; abhaengig von laufendem Paperless"
+
+  immich_server:
+    description: Foto-/Video-App
+    tier: 2
+    category: app
+    container_name: immich_server
+    dependencies:
+      - immich_postgres
+      - immich_redis
+      - immich_machine_learning
+      - traefik
+    url: https://immich.kaleschke.info
+    dump_file: immich.dump
+    data_paths:
+      - /mnt/user/photos/immich
+      - /mnt/user/photos/family_archive
+    first_check: "immich_postgres healthy? immich_redis healthy? ML-Container healthy? immich_default Netz?"
+    notes: "native App-Auth; externes Fotoarchiv gemountet"
+
+  immich_postgres:
+    description: Immich-Datenbank
+    tier: 2
+    category: infra
+    container_name: immich_postgres
+    dependencies: []
+    url: null
+    dump_file: immich.dump
+    data_paths:
+      - /mnt/user/appdata/immich_postgres
+    first_check: "immich_default Netz? Disk-Space? pg_isready?"
+    notes: "nie ins frontend_net; immich_default Netz isoliert"
+
+  immich_redis:
+    description: Immich Cache
+    tier: 2
+    category: infra
+    container_name: immich_redis
+    dependencies: []
+    url: null
+    dump_file: null
+    data_paths: []
+    first_check: "immich_default Netz? redis-cli ping?"
+    notes: "rebuildbar; anonymes Volume — named volume als offenes TODO"
+
+  immich_machine_learning:
+    description: Immich ML (Gesichtserkennung, Suche)
+    tier: 2
+    category: infra
+    container_name: immich_machine_learning
+    dependencies: []
+    url: null
+    dump_file: null
+    data_paths:
+      - model-cache
+    first_check: "immich_default Netz? model-cache Volume vorhanden?"
+    notes: "rebuildbar; intern-only"
+
+  mealie:
+    description: Rezeptverwaltung
+    tier: 2
+    category: app
+    container_name: mealie
+    dependencies:
+      - mealie-postgres
+      - traefik
+    url: https://mealie.kaleschke.info
+    dump_file: mealie.dump
+    data_paths:
+      - /mnt/user/appdata/mealie/data
+    first_check: "mealie-postgres healthy? mealie_internal Netz erreichbar?"
+    notes: "App + DB in internem Netz getrennt (mealie_internal)"
+
+  mealie-postgres:
+    description: Mealie-Datenbank
+    tier: 2
+    category: infra
+    container_name: mealie-postgres
+    dependencies: []
+    url: null
+    dump_file: mealie.dump
+    data_paths:
+      - /mnt/user/appdata/mealie/postgres
+    first_check: "mealie_internal Netz? Disk-Space?"
+    notes: "interne DB; mealie_internal Netz"
+
+  mail-archiver:
+    description: Mail-Archivierung (IMAP)
+    tier: 2
+    category: app
+    container_name: mail-archiver
+    dependencies:
+      - postgresql17
+      - authelia
+      - traefik
+    url: https://mail.kaleschke.info
+    dump_file: postgresql17-mailarchiver.dump
+    data_paths:
+      - /mnt/user/appdata/mailarchiver/data-protection-keys
+    first_check: "PostgreSQL healthy? Internet-/IMAP-Zugang? Authelia healthy?"
+    notes: "Hybrid: frontend_net fuer IMAP/Internet, backend_net fuer DB"
+
+  nextcloud:
+    description: Datei-/Cloud-Dienst
+    tier: 2
+    category: app
+    container_name: nextcloud
+    dependencies:
+      - nextcloud-postgres
+      - nextcloud-redis
+      - traefik
+    url: https://cloud.kaleschke.info
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/nextcloud/html
+      - /mnt/user/documents/nextcloud-data
+    first_check: "nextcloud-postgres healthy? nextcloud-redis healthy? nextcloud_internal Netz?"
+    notes: "native App-Auth (kein zentrales ForwardAuth); WebDAV/CardDAV beachten"
+
+  nextcloud-postgres:
+    description: Nextcloud-Datenbank
+    tier: 2
+    category: infra
+    container_name: nextcloud-postgres
+    dependencies: []
+    url: null
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/nextcloud/postgres
+    first_check: "nextcloud_internal Netz? Disk-Space?"
+    notes: "interne DB"
+
+  nextcloud-redis:
+    description: Nextcloud Cache / Locking
+    tier: 2
+    category: infra
+    container_name: nextcloud-redis
+    dependencies: []
+    url: null
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/nextcloud/redis
+    first_check: "nextcloud_internal Netz? redis-cli ping?"
+    notes: "rebuildbar"
+
+  ntfy:
+    description: Push-Benachrichtigungen (Alert-Backbone)
+    tier: 2
+    category: app
+    container_name: ntfy
+    dependencies:
+      - traefik
+    url: https://ntfy.kaleschke.info
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/ntfy
+    first_check: "HTTPS erreichbar? NTFY_BEHIND_PROXY=true gesetzt? Traefik healthy?"
+    notes: "KRITISCH: Ausfall bedeutet keine anderen Alerts ankommen; Monitoring/Borg-Benachrichtigungen"
+
+# ---------------------------------------------------------------------------
+# TIER 3 — Ops / Tools (Ausfall schmerzt, blockiert nichts Kritisches)
+# ---------------------------------------------------------------------------
+
+  homepage:
+    description: Start-Dashboard
+    tier: 3
+    category: ops
+    container_name: homepage
+    dependencies:
+      - traefik
+    url: https://home.kaleschke.info
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/homepage
+    first_check: "Traefik erreichbar? Docker-Socket read-only lesbar? API-Tokens gueltig?"
+    notes: "Docker socket read-only; viele API Tokens in Config"
+
+  uptime-kuma:
+    description: Monitoring / Uptime Checks
+    tier: 3
+    category: ops
+    container_name: UptimeKuma
+    dependencies:
+      - traefik
+    url: https://uptime.kaleschke.info
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/uptime-kuma
+    first_check: "Datenbank-Volume intakt? Traefik erreichbar?"
+    notes: "Monitore nach Restore manuell pruefen"
+
+  grafana:
+    description: Metrik-Dashboard
+    tier: 3
+    category: ops
+    container_name: grafana
+    dependencies:
+      - influxdb3-core
+      - traefik
+    url: https://grafana.kaleschke.info
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/grafana
+    first_check: "influxdb3-core healthy? Datasource-Token in Secret gesetzt? Provisioning-Konfig vorhanden?"
+    notes: "laeuft als user 0 wegen Host-Appdata-Permissions (dokumentiert); Datasource wird provisioniert"
+
+  influxdb3-core:
+    description: Zeitreihen- / Metrikdaten fuer Grafana und Home Assistant
+    tier: 3
+    category: ops
+    container_name: influxdb3-core
+    dependencies: []
+    url: null
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/influxdb3/data
+      - /mnt/user/appdata/influxdb3/plugins
+    first_check: "LAN-Port 8181 erreichbar? 401 ohne Token = OK (erwartet). Disk-Space?"
+    notes: "LAN-only Host-Port 8181; kein frontend_net; laeuft als user 0"
+
+  scrutiny:
+    description: Laufwerks- / SMART-Monitoring
+    tier: 3
+    category: ops
+    container_name: scrutiny
+    dependencies:
+      - traefik
+    url: https://scrutiny.kaleschke.info
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/scrutiny/config
+      - /mnt/user/appdata/scrutiny/influxdb
+    first_check: "Device-Mounts vorhanden? privileged=true gesetzt? Traefik erreichbar?"
+    notes: "privileged: true dokumentierte Ausnahme"
+
+  glances:
+    description: System- / Container-Monitoring
+    tier: 3
+    category: ops
+    container_name: glances
+    dependencies:
+      - traefik
+    url: https://glances.kaleschke.info
+    dump_file: null
+    data_paths: []
+    first_check: "Docker-Socket lesbar? rootfs gemountet? Traefik erreichbar?"
+    notes: "rebuildbar; Docker-Socket und rootfs Mounts"
+
+  borg-ui:
+    description: Borg Backup- / Restore UI
+    tier: 3
+    category: ops
+    container_name: borg-ui
+    dependencies:
+      - traefik
+    url: https://borg.kaleschke.info
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/borg-ui/data
+      - /mnt/user/backups/borg/dumps
+    first_check: "Borg-Repo-Credentials vorhanden? Backup-Mounts erreichbar? Traefik healthy?"
+    notes: "breite Mounts bewusst dokumentiert; /local/secrets im DR-Scope"
+
+  backrest:
+    description: Backup-Admin-Dienst (Legacy-Backup-Ebene)
+    tier: 3
+    category: ops
+    container_name: backrest
+    dependencies:
+      - traefik
+    url: https://backrest.kaleschke.info
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/backrest
+    first_check: "Repo/SSH-Mounts erreichbar? Traefik healthy?"
+    notes: "breite Mounts bewusst dokumentiert"
+
+  hermes-gateway:
+    description: Hermes Agent Gateway / AI Ops Assistant
+    tier: 3
+    category: ops
+    container_name: hermes-gateway
+    dependencies: []
+    url: null
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/hermes-agent/data
+    first_check: "hermes_net:8642/health erreichbar? SSH-Key gemountet? LLM-Provider erreichbar?"
+    notes: "kein Docker-Socket; SSH terminal backend; echte .env auf Host-Appdata"
+
+  ddns-updater:
+    description: Cloudflare / DDNS Aktualisierung
+    tier: 3
+    category: infra
+    container_name: ddns-updater
+    dependencies: []
+    url: null
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/ddns-updater
+    first_check: "Internetzugang? Cloudflare API erreichbar? Config vorhanden?"
+    notes: "bewusst in frontend_net weil backend_net internal ist"
+
+  code-server:
+    description: Web-Editor / Operations Workspace
+    tier: 3
+    category: ops
+    container_name: code-server
+    dependencies:
+      - traefik
+    url: https://code.kaleschke.info
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/code-server
+      - /mnt/user/services/dev
+    first_check: "Traefik erreichbar? PASSWORD_FILE lesbar?"
+    notes: "PASSWORD_FILE; Workspaces bei Restore beachten"
+
+  filebrowser:
+    description: Datei-Browser fuer Appdata
+    tier: 3
+    category: ops
+    container_name: filebrowser
+    dependencies:
+      - traefik
+    url: https://files.kaleschke.info
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/filebrowser
+    first_check: "Appdata-Mounts erreichbar? Traefik healthy?"
+    notes: "breiter /mnt/user/appdata Mount; Einschraenkung langfristig als TODO"
+
+  speedtest-tracker:
+    description: Speedtest-Monitoring
+    tier: 3
+    category: ops
+    container_name: speedtest-tracker
+    dependencies:
+      - traefik
+    url: https://speedtest.kaleschke.info
+    dump_file: null
+    data_paths:
+      - /mnt/user/appdata/speedtest-tracker/config
+    first_check: "APP_KEY gesetzt? Internetzugang fuer Speedtest vorhanden?"
+    notes: "APP_KEY, ADMIN_PASSWORD als Stack ENV"
+
+  bentopdf:
+    description: PDF-Tooling
+    tier: 3
+    category: app
+    container_name: bentopdf
+    dependencies:
+      - traefik
+    url: https://pdf.kaleschke.info
+    dump_file: null
+    data_paths: []
+    first_check: "COOP/COEP Middleware gesetzt? Traefik healthy?"
+    notes: "rebuildbar; keine kritische Persistenz; Live-Status pruefen"
@@ -0,0 +1,153 @@
+# Skill: homelab-ops-monitor
+
+## Zweck
+
+Dieser Skill macht Hermes zum kontextuellen Ops-Assistenten fuer das Kallilabcore-Homelab.
+Wenn ein Container unhealthy wird, liefert dieser Skill keine rohe Fehlermeldung,
+sondern einen angereicherten Alert: Was ist kaputt, welche Abhaengigkeiten sind betroffen,
+wie alt ist der letzte Backup-Dump, und was ist der erste konkrete Diagnoseschritt.
+
+---
+
+## Wann aktivieren
+
+- Wenn ein Container unhealthy gemeldet wird (manuell oder via Cronjob)
+- Wenn der Benutzer fragt: "Was ist kaputt?" / "Was ist mit [Service]?"
+- Wenn ein proaktiver Health-Check ausgefuehrt werden soll
+- Wenn ein ntfy-Alert angereichert werden soll bevor er gesendet wird
+
+---
+
+## Kernprinzipien
+
+1. **Immer check_health.py ausfuehren** — nie raten, immer messen.
+2. **Kontext aus services.yaml** — Abhaengigkeiten und Dump-Info sind dort definiert.
+3. **ntfy-Alert nur wenn wirklich etwas unhealthy ist** — kein Alert-Spam.
+4. **Tier 1 = urgent, Tier 2 = high, Tier 3 = default** — ntfy Priority entsprechend setzen.
+5. **Kein Schreiben, kein Neustart** — dieser Skill diagnostiziert, handelt nicht.
+
+---
+
+## Ausfuehrungsschritte
+
+### Schritt 1 — Health-Check ausfuehren
+
+Fuehre via Terminal (SSH) auf dem Host aus:
+
+```bash
+python3 /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py
+```
+
+Fuer einen gezielten Service:
+```bash
+python3 /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py <service-key>
+```
+
+Fuer den Gesamtstatus (Tier 1+2):
+```bash
+python3 /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py --summary
+```
+
+### Schritt 2 — JSON-Output interpretieren
+
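+Ohne Argument liefert das Script eine Liste von Service-Reports (oder `{"status": "all_healthy", ...}`, wenn kein Container auffaellig ist), mit `<service-key>` genau einen Service-Report und mit `--summary` ein Objekt mit `issues`, `stale_dumps` und `overall_healthy`. Ein Service-Report enthaelt: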
+- `tier` — Kritikalitaet (1=Control Plane, 2=App, 3=Ops)
+- `container.healthy` — aktueller Gesundheitsstatus
+- `unhealthy_deps` — Liste der ebenfalls unhealthy Abhaengigkeiten
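+- `dump.age_hours` — Alter des letzten Dumps in Stunden (>26h = Warnung); `dump` ist `null`, wenn kein Dump konfiguriert ist, und bei `dump.exists` = `false` (Datei fehlt) ist `age_hours` ebenfalls `null`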
+- `dump.warn` — true wenn Dump veraltet
+- `first_check` — erster Diagnoseschritt laut service catalog
+- `notes` — betriebliche Hinweise
+
+### Schritt 3 — ntfy-Alert bauen
+
+Baue eine ntfy-Nachricht nach diesem Format:
+
+```
+[Titel]
+[Tier-Emoji] [service-key] unhealthy (Tier [N])
+
+Beschreibung: [description]
+
+Abhaengigkeiten:
+  [✅/❌] [dep-key] — [status]
+
+Letzter Dump: [age_hours]h alt [✅/⚠️]  (oder: kein Dump konfiguriert)
+
+Erster Check:
+  [first_check]
+
+Hinweis: [notes]
+```
+
+Tier-Emojis: Tier 1 = 🔴, Tier 2 = 🟠, Tier 3 = 🟡
+Dump-Warnschwelle: >26 Stunden = ⚠️
+
+### Schritt 4 — ntfy senden
+
+```bash
+curl -s \
+  -H "Title: [Tier N] [service-key] unhealthy" \
+  -H "Priority: [urgent|high|default]" \
+  -H "Tags: [warning,tier1|tier2|tier3]" \
+  -d "[message]" \
+  https://ntfy.kaleschke.info/homelab-alerts
+```
+
+ntfy Prioritaeten:
+- Tier 1 → `urgent`
+- Tier 2 → `high`
+- Tier 3 → `default`
+
+---
+
+## Sonderfaelle
+
+### Unbekannter Container (nicht in services.yaml)
+- Alert senden mit Hinweis "nicht in services.yaml — bitte aktualisieren"
+- services.yaml Pfad: `/mnt/user/services/homelab/ops/hermes-agent/services.yaml`
+
+### ntfy selbst ist unhealthy
+- Alert kann nicht per ntfy gesendet werden
+- Hermes sendet stattdessen via Telegram (falls konfiguriert)
+- Nachricht: "KRITISCH: ntfy ist unhealthy — kein Push-Alerting aktiv"
+
+### Alle Tier-1-Abhaengigkeiten unhealthy
+- Wahrscheinlich kein isoliertes Problem — Host oder Netzwerk pruefen
+- Zusammenfassenden Alert senden statt Einzel-Alerts
+
+### check_health.py nicht gefunden
+- Meldung: "Script nicht gefunden unter /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py"
+- Pruefe ob Komodo den Stack zuletzt deployed hat
+
+---
+
+## Cronjob-Empfehlung
+
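+Fuer automatische Checks ohne Uptime-Kuma-Webhook (das Script gibt nur JSON auf stdout aus und sendet selbst keine Benachrichtigung — die Ausgabe muss von Hermes oder einem Wrapper weiterverarbeitet werden):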
+
+```
+# Jede Stunde — prueft alle unhealthy Container
+0 * * * * python3 /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py
+
+# Taeglich 07:00 — Gesamtstatus Tier 1+2
+0 7 * * * python3 /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py --summary
+```
+
+---
+
+## Nicht-Ziele dieses Skills
+
+- **Kein automatischer Neustart** von Containern
+- **Kein Schreiben** in Compose-Dateien oder Konfigurationen
+- **Kein Deploy** via Komodo
+- **Keine Diagnose-Tiefe** jenseits des `first_check`-Hinweises (das ist Aufgabe des Benutzers)
+
+---
+
+## Verwandte Skills und Ressourcen
+
+- `kallilab-homelab-ops` — Governance-Skill fuer Aenderungsentscheidungen
+- `services.yaml` — Wissensbasis: `/mnt/user/services/homelab/ops/hermes-agent/services.yaml`
+- `check_health.py` — Ausfuehrungs-Script: `/mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py`
+- Repo: `https://git.kaleschke.info` (origin/master)