#!/usr/bin/env python3 """ check_health.py — Homelab Alert Enricher ========================================= Laedt services.json, prueft Docker-Health aller bekannten Abhaengigkeiten, liest Dump-Timestamps und gibt einen strukturierten JSON-Report aus. Hermes liest diesen Report und baut daraus eine angereicherte ntfy-Nachricht. Keine externen Abhaengigkeiten — nur Python-Standardbibliothek. Verwendung: python3 check_health.py # alle unhealthy Container python3 check_health.py paperless-ngx # gezielt einen Service pruefen python3 check_health.py --summary # Gesamtstatus als Zusammenfassung Pfad auf der Hermes-VM (via git pull): /srv/hermes-workspace/homelab-infra/ops/hermes-agent/scripts/check_health.py services.json wird relativ zum Script-Verzeichnis gesucht: ../services.json """ import json import os import subprocess import sys from datetime import datetime from pathlib import Path # --------------------------------------------------------------------------- # Konfiguration # --------------------------------------------------------------------------- SCRIPT_DIR = Path(__file__).parent.resolve() SERVICES_JSON_PATH = SCRIPT_DIR.parent / "services.json" # Fallback falls das Repo unter einem anderen Pfad liegt SERVICES_JSON_FALLBACK = Path("/srv/hermes-workspace/homelab-infra/ops/hermes-agent/services.json") # Dump-Warnschwelle in Stunden (aelter = Warnung) DUMP_WARN_HOURS = 26 # --------------------------------------------------------------------------- # Hilfsfunktionen # --------------------------------------------------------------------------- def load_services(): """Laedt services.json. Gibt (services_dict, meta_dict) zurueck. Keine externen Abhaengigkeiten — verwendet nur json aus der Standardbibliothek.""" path = SERVICES_JSON_PATH if SERVICES_JSON_PATH.exists() else SERVICES_JSON_FALLBACK if not path.exists(): raise FileNotFoundError( f"services.json nicht gefunden: {path}\n" f"Bitte 'git pull' in /srv/hermes-workspace/homelab-infra/ ausfuehren." ) with open(path, encoding="utf-8") as f: data = json.load(f) return data.get("services", {}), data.get("meta", {}) def docker_inspect(container_name: str) -> dict: """ Gibt {'status': str, 'health': str} zurueck. status: running | exited | restarting | dead | not_found | error health: healthy | unhealthy | starting | none | unknown """ try: result = subprocess.run( [ "docker", "inspect", "--format", "{{.State.Status}}|||{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}", container_name, ], capture_output=True, text=True, timeout=10, ) if result.returncode != 0: return {"status": "not_found", "health": "unknown"} parts = result.stdout.strip().split("|||") return { "status": parts[0].strip() if parts else "unknown", "health": parts[1].strip() if len(parts) > 1 else "none", } except Exception as e: return {"status": "error", "health": str(e)} def is_healthy(inspect_result: dict) -> bool: status = inspect_result.get("status", "") health = inspect_result.get("health", "") if status != "running": return False if health in ("unhealthy",): return False return True def get_unhealthy_containers() -> list[str]: """Gibt Liste aller Container zurueck die unhealthy oder nicht running sind.""" try: # unhealthy per healthcheck r1 = subprocess.run( ["docker", "ps", "--filter", "health=unhealthy", "--format", "{{.Names}}"], capture_output=True, text=True, timeout=10, ) # exited/dead Container die eigentlich laufen sollten r2 = subprocess.run( ["docker", "ps", "--filter", "status=exited", "--format", "{{.Names}}"], capture_output=True, text=True, timeout=10, ) names = set() for raw in (r1.stdout, r2.stdout): for name in raw.strip().split("\n"): name = name.strip() if name: names.add(name) return sorted(names) except Exception: return [] def get_dump_info(dump_file: str | None, dump_base: str) -> dict | None: """Gibt Alter und Groesse des Dump-Files zurueck (oder None wenn nicht vorhanden).""" if not dump_file: return None path = Path(dump_base) / dump_file if not path.exists(): return {"file": dump_file, "exists": False, "age_hours": None, "size_mb": None} stat = path.stat() age_hours = round((datetime.now().timestamp() - stat.st_mtime) / 3600, 1) size_mb = round(stat.st_size / 1_048_576, 1) return { "file": dump_file, "exists": True, "age_hours": age_hours, "size_mb": size_mb, "warn": age_hours > DUMP_WARN_HOURS, } # --------------------------------------------------------------------------- # Report-Generierung # --------------------------------------------------------------------------- def build_service_report(service_key: str, service: dict, all_services: dict, meta: dict) -> dict: """Erstellt einen vollstaendigen Report fuer einen einzelnen Service.""" dump_base = meta.get("dump_base", "/mnt/user/backups/borg/dumps/latest") # Eigener Container-Status own_inspect = docker_inspect(service["container_name"]) own_healthy = is_healthy(own_inspect) # Abhaengigkeits-Check dep_results = {} for dep_key in service.get("dependencies", []): dep = all_services.get(dep_key) if not dep: dep_results[dep_key] = {"status": "unknown_service", "health": "unknown", "healthy": False} continue insp = docker_inspect(dep["container_name"]) dep_results[dep_key] = { **insp, "healthy": is_healthy(insp), "tier": dep.get("tier"), "container_name": dep["container_name"], } unhealthy_deps = [k for k, v in dep_results.items() if not v["healthy"]] # Dump-Info dump_info = get_dump_info(service.get("dump_file"), dump_base) return { "service": service_key, "description": service.get("description", ""), "tier": service.get("tier"), "url": service.get("url"), "container": { "name": service["container_name"], **own_inspect, "healthy": own_healthy, }, "dependencies": dep_results, "unhealthy_deps": unhealthy_deps, "dump": dump_info, "first_check": service.get("first_check", ""), "notes": service.get("notes", ""), "timestamp": datetime.now().isoformat(), } def build_summary_report(all_services: dict, meta: dict) -> dict: """Prueft alle Tier-1 und Tier-2 Dienste und gibt einen Gesamtstatus zurueck.""" results = {} issues = [] for key, svc in all_services.items(): tier = svc.get("tier", 3) if tier > 2: continue # Tier-3 im Summary ueberspringen insp = docker_inspect(svc["container_name"]) healthy = is_healthy(insp) results[key] = { "tier": tier, "healthy": healthy, "status": insp["status"], "health": insp["health"], } if not healthy: issues.append({"service": key, "tier": tier, **insp}) # Dump-Checks fuer alle Dienste mit dump_file dump_base = meta.get("dump_base", "/mnt/user/backups/borg/dumps/latest") stale_dumps = [] for key, svc in all_services.items(): info = get_dump_info(svc.get("dump_file"), dump_base) if info and info.get("warn"): stale_dumps.append({ "service": key, "file": info["file"], "age_hours": info["age_hours"], }) return { "mode": "summary", "timestamp": datetime.now().isoformat(), "services_checked": len(results), "issues": issues, "stale_dumps": stale_dumps, "overall_healthy": len(issues) == 0 and len(stale_dumps) == 0, } # --------------------------------------------------------------------------- # Einstiegspunkt # --------------------------------------------------------------------------- def main(): args = sys.argv[1:] all_services, meta = load_services() if "--summary" in args: report = build_summary_report(all_services, meta) print(json.dumps(report, indent=2, ensure_ascii=False)) return # Expliziter Service-Key als Argument if args and not args[0].startswith("--"): service_key = args[0] service = all_services.get(service_key) if not service: print(json.dumps({"error": f"Service '{service_key}' nicht in services.yaml gefunden."})) sys.exit(1) report = build_service_report(service_key, service, all_services, meta) print(json.dumps(report, indent=2, ensure_ascii=False)) return # Kein Argument: alle unhealthy Container automatisch finden unhealthy_names = get_unhealthy_containers() if not unhealthy_names: print(json.dumps({"status": "all_healthy", "timestamp": datetime.now().isoformat()})) return reports = [] for container_name in unhealthy_names: # Container-Name auf Service-Key mappen service_key = None service = None for key, svc in all_services.items(): if svc["container_name"] == container_name: service_key = key service = svc break if not service: reports.append({ "service": container_name, "description": "Unbekannter Container (nicht in services.yaml)", "tier": None, "container": {"name": container_name, "status": "unhealthy", "health": "unknown", "healthy": False}, "dependencies": {}, "unhealthy_deps": [], "dump": None, "first_check": "Container nicht in services.yaml — manuell pruefen", "notes": "services.yaml aktualisieren wenn dieser Container produktiv ist", "timestamp": datetime.now().isoformat(), }) continue reports.append(build_service_report(service_key, service, all_services, meta)) print(json.dumps(reports, indent=2, ensure_ascii=False)) if __name__ == "__main__": main()