Files
homelab-infra/ops/hermes-agent/scripts/check_health.py
T
Micha 0aa8138bdd hermes update
hermes update
2026-05-06 19:13:52 +02:00

313 lines
10 KiB
Python

#!/usr/bin/env python3
"""
check_health.py — Homelab Alert Enricher
=========================================
Loads services.yaml, checks the Docker health of all known dependencies,
reads dump timestamps and prints a structured JSON report.
Hermes reads this report and builds an enriched ntfy message from it.

Usage:
    python3 check_health.py                 # all unhealthy containers
    python3 check_health.py paperless-ngx   # check one specific service
    python3 check_health.py --summary       # overall status as a summary

Path on the host (via Komodo clone):
    /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py

services.yaml is looked up relative to the script directory:
    ../services.yaml
"""
import json
import os
import subprocess
import sys
from datetime import datetime
from pathlib import Path
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Directory that contains this script (resolved to an absolute path).
SCRIPT_DIR = Path(__file__).parent.resolve()
# Primary location of services.yaml: one level above the script directory.
SERVICES_YAML_PATH = SCRIPT_DIR.parent / "services.yaml"
# Fallback in case the repo is checked out under a different path
SERVICES_YAML_FALLBACK = Path("/mnt/user/services/homelab/ops/hermes-agent/services.yaml")
# Dump warning threshold in hours (a dump older than this triggers a warning)
DUMP_WARN_HOURS = 26
# ---------------------------------------------------------------------------
# Hilfsfunktionen
# ---------------------------------------------------------------------------
def load_services():
    """Load services.yaml and return ``(services_dict, meta_dict)``.

    The file is read from ``SERVICES_YAML_PATH``; if that path does not
    exist, ``SERVICES_YAML_FALLBACK`` is tried instead.

    Returns:
        A tuple ``(services, meta)`` taken from the top-level ``services``
        and ``meta`` keys. An empty file, a missing key, or a key whose
        value is null all yield ``{}`` for the respective element.

    Raises:
        FileNotFoundError: If neither candidate path exists.
        subprocess.CalledProcessError: If PyYAML is missing and the
            on-the-fly ``pip install`` fails.
    """
    try:
        import yaml
    except ImportError:
        # PyYAML is not installed — minimal fallback: install it via pip.
        # pip's stdout is discarded so it can never end up in front of the
        # JSON document this script prints; errors still reach stderr.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "pyyaml", "-q"],
            check=True,
            stdout=subprocess.DEVNULL,
        )
        import yaml

    path = SERVICES_YAML_PATH if SERVICES_YAML_PATH.exists() else SERVICES_YAML_FALLBACK
    if not path.exists():
        raise FileNotFoundError(f"services.yaml nicht gefunden: {path}")

    # Explicit UTF-8: the locale default inside minimal containers is often
    # ASCII, which would break on non-ASCII descriptions/notes.
    with open(path, encoding="utf-8") as f:
        data = yaml.safe_load(f)

    # safe_load() returns None for an empty document, and a key that is
    # present without a value ("services:") also loads as None.
    data = data or {}
    return data.get("services") or {}, data.get("meta") or {}
def docker_inspect(container_name: str) -> dict:
    """Query Docker for a container's state and health status.

    Runs ``docker inspect`` with a Go template that prints the container
    state and, when a healthcheck is configured, its health status.

    Args:
        container_name: Name of the container to inspect.

    Returns:
        ``{'status': str, 'health': str}`` where
        status is running | exited | restarting | dead | not_found | error | unknown
        health is healthy | unhealthy | starting | none | unknown.
        For status ``error`` the ``health`` field carries the error message
        (exception text or Docker's stderr) instead of a health value.
    """
    template = (
        "{{.State.Status}}|||"
        "{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}"
    )
    try:
        result = subprocess.run(
            ["docker", "inspect", "--format", template, container_name],
            capture_output=True,
            text=True,
            timeout=10,
        )
    except Exception as e:
        # Deliberately broad: a health checker must not crash because the
        # docker binary is missing, the call timed out, etc.
        return {"status": "error", "health": str(e)}

    if result.returncode != 0:
        stderr = result.stderr.strip()
        # Only report "not_found" when Docker actually said so (or gave no
        # reason at all); an unreachable daemon or a permission problem is
        # an error of the check itself, not a missing container.
        if not stderr or "no such" in stderr.lower():
            return {"status": "not_found", "health": "unknown"}
        return {"status": "error", "health": stderr}

    status, _, health = result.stdout.strip().partition("|||")
    return {
        # Empty output would otherwise produce an empty status string.
        "status": status.strip() or "unknown",
        "health": health.strip() or "none",
    }
def is_healthy(inspect_result: dict) -> bool:
    """Decide whether an inspect result counts as healthy.

    A container is healthy when its ``status`` is ``running`` and its
    ``health`` is anything other than ``unhealthy`` (so ``healthy``,
    ``starting`` and ``none`` all pass). Missing keys are treated as
    empty strings.
    """
    is_running = inspect_result.get("status", "") == "running"
    flagged_unhealthy = inspect_result.get("health", "") == "unhealthy"
    return is_running and not flagged_unhealthy
def get_unhealthy_containers() -> list[str]:
    """Return the sorted names of containers that are unhealthy, exited or dead.

    Runs one ``docker ps`` query per filter and merges the resulting names.

    Returns:
        A sorted, de-duplicated list of container names. If Docker cannot
        be executed at all the list is empty; every failure is reported on
        stderr so that an empty result is not silently mistaken for
        "everything is healthy".
    """
    filters = (
        "health=unhealthy",  # failing their healthcheck
        "status=exited",     # exited containers that should actually be running
        "status=dead",       # dead containers (previously announced but never queried)
    )
    names = set()
    try:
        for flt in filters:
            result = subprocess.run(
                ["docker", "ps", "--filter", flt, "--format", "{{.Names}}"],
                capture_output=True, text=True, timeout=10,
            )
            if result.returncode != 0:
                print(
                    f"check_health: 'docker ps --filter {flt}' failed: "
                    f"{result.stderr.strip()}",
                    file=sys.stderr,
                )
                continue
            names.update(
                line.strip() for line in result.stdout.splitlines() if line.strip()
            )
    except Exception as e:
        # Best effort: keep returning an empty list as before, but leave a
        # trace on stderr (stdout is reserved for the JSON report).
        print(f"check_health: could not query docker: {e}", file=sys.stderr)
        return []
    return sorted(names)
def get_dump_info(dump_file: str | None, dump_base: str) -> dict | None:
"""Gibt Alter und Groesse des Dump-Files zurueck (oder None wenn nicht vorhanden)."""
if not dump_file:
return None
path = Path(dump_base) / dump_file
if not path.exists():
return {"file": dump_file, "exists": False, "age_hours": None, "size_mb": None}
stat = path.stat()
age_hours = round((datetime.now().timestamp() - stat.st_mtime) / 3600, 1)
size_mb = round(stat.st_size / 1_048_576, 1)
return {
"file": dump_file,
"exists": True,
"age_hours": age_hours,
"size_mb": size_mb,
"warn": age_hours > DUMP_WARN_HOURS,
}
# ---------------------------------------------------------------------------
# Report-Generierung
# ---------------------------------------------------------------------------
def build_service_report(service_key: str, service: dict, all_services: dict, meta: dict) -> dict:
    """Build the complete report for a single service.

    Inspects the service's own container, inspects the container of every
    listed dependency and collects information about the service's dump
    file.

    Args:
        service_key: Key of the service in services.yaml.
        service: The service's definition; must contain ``container_name``.
        all_services: All service definitions, used to resolve dependencies.
        meta: The ``meta`` section of services.yaml; ``dump_base`` is read
            from it.

    Returns:
        A JSON-serialisable dict with the service's metadata, its
        container state, the per-dependency results, the list of
        unhealthy dependency keys, the dump info and a timestamp.

    Raises:
        KeyError: If the service (or a resolved dependency) has no
            ``container_name``.
    """
    # "or" instead of a .get() default: a key that is present but null in
    # the YAML ("dump_base:" / "dependencies:") must fall back as well.
    dump_base = meta.get("dump_base") or "/mnt/user/backups/borg/dumps/latest"

    # State of the service's own container
    own_inspect = docker_inspect(service["container_name"])
    own_healthy = is_healthy(own_inspect)

    # Dependency check
    dep_results = {}
    for dep_key in service.get("dependencies") or []:
        dep = all_services.get(dep_key)
        if not dep:
            # Dependency is listed but not defined in services.yaml.
            dep_results[dep_key] = {"status": "unknown_service", "health": "unknown", "healthy": False}
            continue
        insp = docker_inspect(dep["container_name"])
        dep_results[dep_key] = {
            **insp,
            "healthy": is_healthy(insp),
            "tier": dep.get("tier"),
            "container_name": dep["container_name"],
        }
    unhealthy_deps = [k for k, v in dep_results.items() if not v["healthy"]]

    # Dump info
    dump_info = get_dump_info(service.get("dump_file"), dump_base)

    return {
        "service": service_key,
        "description": service.get("description", ""),
        "tier": service.get("tier"),
        "url": service.get("url"),
        "container": {
            "name": service["container_name"],
            **own_inspect,
            "healthy": own_healthy,
        },
        "dependencies": dep_results,
        "unhealthy_deps": unhealthy_deps,
        "dump": dump_info,
        "first_check": service.get("first_check", ""),
        "notes": service.get("notes", ""),
        "timestamp": datetime.now().isoformat(),
    }
def build_summary_report(all_services: dict, meta: dict) -> dict:
    """Check all tier-1 and tier-2 services and return an overall status.

    Services whose tier is greater than 2 (or that have no tier, which
    defaults to 3) are skipped for the container check. Dump files are
    checked for every service regardless of tier.

    Returns:
        A dict with ``mode``, ``timestamp``, ``services_checked``,
        ``issues`` (unhealthy services), ``stale_dumps`` (dumps flagged
        with ``warn``) and ``overall_healthy``.
    """
    checked = {}
    problems = []
    for name, entry in all_services.items():
        level = entry.get("tier", 3)
        if not level > 2:  # tier-3 services are left out of the summary
            state = docker_inspect(entry["container_name"])
            ok = is_healthy(state)
            checked[name] = dict(
                tier=level,
                healthy=ok,
                status=state["status"],
                health=state["health"],
            )
            if not ok:
                problems.append({"service": name, "tier": level, **state})

    # Dump checks for every service that declares a dump_file
    base_dir = meta.get("dump_base", "/mnt/user/backups/borg/dumps/latest")
    dump_infos = (
        (name, get_dump_info(entry.get("dump_file"), base_dir))
        for name, entry in all_services.items()
    )
    outdated = [
        {"service": name, "file": info["file"], "age_hours": info["age_hours"]}
        for name, info in dump_infos
        if info and info.get("warn")
    ]

    return {
        "mode": "summary",
        "timestamp": datetime.now().isoformat(),
        "services_checked": len(checked),
        "issues": problems,
        "stale_dumps": outdated,
        "overall_healthy": not problems and not outdated,
    }
# ---------------------------------------------------------------------------
# Einstiegspunkt
# ---------------------------------------------------------------------------
def main():
    """CLI entry point: print a JSON health report to stdout.

    Modes, selected from ``sys.argv``:
      --summary      overall status of all tier-1/tier-2 services
      <service-key>  full report for one service; exits with status 1 when
                     the key is not defined in services.yaml
      (no argument)  one report per container that Docker lists as
                     unhealthy or not running, or ``{"status": "all_healthy"}``
                     when there is none
    """
    args = sys.argv[1:]
    all_services, meta = load_services()

    if "--summary" in args:
        report = build_summary_report(all_services, meta)
        print(json.dumps(report, indent=2, ensure_ascii=False))
        return

    # Explicit service key as argument
    if args and not args[0].startswith("--"):
        service_key = args[0]
        service = all_services.get(service_key)
        if not service:
            print(json.dumps(
                {"error": f"Service '{service_key}' nicht in services.yaml gefunden."},
                ensure_ascii=False,
            ))
            sys.exit(1)
        report = build_service_report(service_key, service, all_services, meta)
        print(json.dumps(report, indent=2, ensure_ascii=False))
        return

    # No argument: find all unhealthy containers automatically
    unhealthy_names = get_unhealthy_containers()
    if not unhealthy_names:
        print(json.dumps({"status": "all_healthy", "timestamp": datetime.now().isoformat()}))
        return

    # Map container name -> (service key, service) once instead of scanning
    # all services per container. setdefault keeps the first definition, the
    # same one a linear scan with an early break would find.
    by_container = {}
    for key, svc in all_services.items():
        by_container.setdefault(svc.get("container_name"), (key, svc))

    reports = []
    for container_name in unhealthy_names:
        match = by_container.get(container_name)
        if match is None:
            # Container is not described in services.yaml: report its real
            # Docker state instead of a made-up one, since the container
            # may be exited rather than failing a healthcheck.
            insp = docker_inspect(container_name)
            reports.append({
                "service": container_name,
                "description": "Unbekannter Container (nicht in services.yaml)",
                "tier": None,
                "container": {"name": container_name, **insp, "healthy": is_healthy(insp)},
                "dependencies": {},
                "unhealthy_deps": [],
                "dump": None,
                "first_check": "Container nicht in services.yaml — manuell pruefen",
                "notes": "services.yaml aktualisieren wenn dieser Container produktiv ist",
                "timestamp": datetime.now().isoformat(),
            })
            continue
        service_key, service = match
        reports.append(build_service_report(service_key, service, all_services, meta))

    print(json.dumps(reports, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()