0aa8138bdd
hermes update
313 lines
10 KiB
Python
313 lines
10 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
check_health.py — Homelab Alert Enricher
=========================================
Loads services.yaml, checks the Docker health of all known dependencies,
reads dump timestamps and prints a structured JSON report.

Hermes reads this report and builds an enriched ntfy message from it.

Usage:
    python3 check_health.py                # all unhealthy containers
    python3 check_health.py paperless-ngx  # check one specific service
    python3 check_health.py --summary      # overall status as a summary

Path on the host (via Komodo clone):
    /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py

services.yaml is looked up relative to the script directory:
    ../services.yaml
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import subprocess
|
|
import sys
|
|
from datetime import datetime
|
|
from pathlib import Path
|
|
|
|
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Absolute path of the directory that contains this script.
SCRIPT_DIR = Path(__file__).parent.resolve()
# Primary location of services.yaml: one directory above the script directory.
SERVICES_YAML_PATH = SCRIPT_DIR.parent / "services.yaml"

# Fallback in case the repo is checked out under a different path
SERVICES_YAML_FALLBACK = Path("/mnt/user/services/homelab/ops/hermes-agent/services.yaml")

# Dump warning threshold in hours (a dump older than this triggers a warning)
DUMP_WARN_HOURS = 26
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Helper functions
# ---------------------------------------------------------------------------
|
|
|
|
def load_services():
    """Load services.yaml and return ``(services_dict, meta_dict)``.

    The file is read from ``SERVICES_YAML_PATH`` when that path exists,
    otherwise from ``SERVICES_YAML_FALLBACK``.  If PyYAML cannot be imported
    it is installed with ``pip`` for the running interpreter and imported
    again.

    Returns:
        A tuple ``(services, meta)`` taken from the top-level ``services``
        and ``meta`` keys of the YAML document.  A missing key, an empty
        section or an entirely empty file yields an empty dict for the
        affected part.

    Raises:
        FileNotFoundError: Neither candidate path exists.
        subprocess.CalledProcessError: PyYAML is missing and the pip
            installation fails.
    """
    try:
        import yaml
    except ImportError:
        # PyYAML is not installed — minimal fallback: install it via pip
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "pyyaml", "-q"],
            check=True,
        )
        import yaml

    path = SERVICES_YAML_PATH if SERVICES_YAML_PATH.exists() else SERVICES_YAML_FALLBACK
    if not path.exists():
        raise FileNotFoundError(f"services.yaml nicht gefunden: {path}")

    # Explicit encoding: the locale default is not guaranteed to be UTF-8.
    with open(path, encoding="utf-8") as f:
        # safe_load returns None for an empty document.
        data = yaml.safe_load(f) or {}

    # A key that is present but left empty (``meta:``) parses to None, which
    # the default argument of .get() would not replace.
    return data.get("services") or {}, data.get("meta") or {}
|
|
|
|
|
|
def docker_inspect(container_name: str) -> dict:
    """Query ``docker inspect`` for one container's state.

    Returns:
        ``{'status': str, 'health': str}`` where
        status is running | exited | restarting | dead | not_found | error
        and health is healthy | unhealthy | starting | none | unknown.
        ``not_found``/``unknown`` is returned whenever the docker command
        exits non-zero.  If running the command raises, status is ``error``
        and ``health`` carries the exception text.
    """
    # Go template: "<status>|||<health status, or 'none' without healthcheck>"
    template = "{{.State.Status}}|||{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}"
    command = ["docker", "inspect", "--format", template, container_name]
    try:
        proc = subprocess.run(command, capture_output=True, text=True, timeout=10)
        if proc.returncode:
            return {"status": "not_found", "health": "unknown"}

        fields = [piece.strip() for piece in proc.stdout.strip().split("|||")]
        status = fields[0]
        health = fields[1] if len(fields) > 1 else "none"
        return {"status": status, "health": health}
    except Exception as exc:
        return {"status": "error", "health": str(exc)}
|
|
|
|
|
|
def is_healthy(inspect_result: dict) -> bool:
    """Return True when the container is running and not flagged unhealthy.

    Args:
        inspect_result: Mapping with optional ``status`` and ``health``
            entries, as produced by ``docker_inspect``; missing entries are
            treated as empty strings.

    Returns:
        False if ``status`` is anything other than ``"running"`` or if
        ``health`` equals ``"unhealthy"``; True otherwise (so ``none`` and
        ``starting`` count as healthy).
    """
    is_running = inspect_result.get("status", "") == "running"
    flagged_unhealthy = inspect_result.get("health", "") == "unhealthy"
    return is_running and not flagged_unhealthy
|
|
|
|
|
|
def get_unhealthy_containers() -> list[str]:
    """Return the sorted names of containers that are unhealthy, exited or dead.

    Two ``docker ps`` queries are issued: one for containers whose
    healthcheck reports ``unhealthy`` and one for containers in state
    ``exited`` or ``dead``.  The resulting names are de-duplicated.

    Returns:
        Sorted list of container names.  If docker cannot be run at all an
        empty list is returned; a query that exits non-zero contributes no
        names.  In both cases a diagnostic is written to stderr so that an
        empty result caused by a docker failure can be told apart from a
        genuinely healthy host, while stdout stays reserved for the JSON
        report.
    """
    filter_sets = (
        # flagged unhealthy by their healthcheck
        ["--filter", "health=unhealthy"],
        # exited/dead containers that should actually be running; docker
        # ORs repeated filters that use the same key
        ["--filter", "status=exited", "--filter", "status=dead"],
    )
    names: set[str] = set()
    try:
        for filters in filter_sets:
            proc = subprocess.run(
                ["docker", "ps", *filters, "--format", "{{.Names}}"],
                capture_output=True, text=True, timeout=10,
            )
            if proc.returncode != 0:
                print(
                    f"check_health: 'docker ps {' '.join(filters)}' failed "
                    f"(exit {proc.returncode}): {proc.stderr.strip()}",
                    file=sys.stderr,
                )
                continue
            names.update(
                line.strip() for line in proc.stdout.splitlines() if line.strip()
            )
    except Exception as exc:
        # Best effort: stay non-fatal, but do not hide the failure.
        print(f"check_health: could not query docker: {exc}", file=sys.stderr)
        return []
    return sorted(names)
|
|
|
|
|
|
def get_dump_info(dump_file: str | None, dump_base: str) -> dict | None:
|
|
"""Gibt Alter und Groesse des Dump-Files zurueck (oder None wenn nicht vorhanden)."""
|
|
if not dump_file:
|
|
return None
|
|
|
|
path = Path(dump_base) / dump_file
|
|
if not path.exists():
|
|
return {"file": dump_file, "exists": False, "age_hours": None, "size_mb": None}
|
|
|
|
stat = path.stat()
|
|
age_hours = round((datetime.now().timestamp() - stat.st_mtime) / 3600, 1)
|
|
size_mb = round(stat.st_size / 1_048_576, 1)
|
|
|
|
return {
|
|
"file": dump_file,
|
|
"exists": True,
|
|
"age_hours": age_hours,
|
|
"size_mb": size_mb,
|
|
"warn": age_hours > DUMP_WARN_HOURS,
|
|
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Report generation
# ---------------------------------------------------------------------------
|
|
|
|
def build_service_report(service_key: str, service: dict, all_services: dict, meta: dict) -> dict:
    """Build the complete report for a single service.

    Args:
        service_key: Key of the service; echoed as ``service`` in the report.
        service: The service's configuration entry.  ``container_name`` is
            required; ``dependencies``, ``dump_file``, ``description``,
            ``tier``, ``url``, ``first_check`` and ``notes`` are optional.
        all_services: Mapping of service key to configuration entry, used to
            resolve the keys listed under ``dependencies``.
        meta: Mapping from which ``dump_base`` is read (with a built-in
            default path).

    Returns:
        A dict with the service's own container state, one entry per
        dependency, the list of dependency keys that are not healthy, the
        dump information and an ISO timestamp.  A dependency key that is not
        present in ``all_services`` is reported with status
        ``unknown_service``.

    Raises:
        KeyError: ``service`` or a resolved dependency has no
            ``container_name``.
    """
    dump_base = meta.get("dump_base", "/mnt/user/backups/borg/dumps/latest")

    # State of the service's own container
    own_inspect = docker_inspect(service["container_name"])
    own_healthy = is_healthy(own_inspect)

    # Dependency check
    dep_results = {}
    # ``dependencies:`` left empty in the YAML parses to None, which the
    # default argument of .get() would not replace — hence ``or []``.
    for dep_key in service.get("dependencies") or []:
        dep = all_services.get(dep_key)
        if not dep:
            dep_results[dep_key] = {"status": "unknown_service", "health": "unknown", "healthy": False}
            continue
        insp = docker_inspect(dep["container_name"])
        dep_results[dep_key] = {
            **insp,
            "healthy": is_healthy(insp),
            "tier": dep.get("tier"),
            "container_name": dep["container_name"],
        }

    unhealthy_deps = [k for k, v in dep_results.items() if not v["healthy"]]

    # Dump information
    dump_info = get_dump_info(service.get("dump_file"), dump_base)

    return {
        "service": service_key,
        "description": service.get("description", ""),
        "tier": service.get("tier"),
        "url": service.get("url"),
        "container": {
            "name": service["container_name"],
            **own_inspect,
            "healthy": own_healthy,
        },
        "dependencies": dep_results,
        "unhealthy_deps": unhealthy_deps,
        "dump": dump_info,
        "first_check": service.get("first_check", ""),
        "notes": service.get("notes", ""),
        "timestamp": datetime.now().isoformat(),
    }
|
|
|
|
|
|
def build_summary_report(all_services: dict, meta: dict) -> dict:
    """Check all tier-1 and tier-2 services and return an overall status.

    Services whose ``tier`` is greater than 2 (or absent, which defaults
    to 3) are skipped for the container check.  The dump check, in contrast,
    runs over every service regardless of tier.

    Args:
        all_services: Mapping of service key to configuration entry.
        meta: Mapping from which ``dump_base`` is read (with a built-in
            default path).

    Returns:
        A dict with ``mode``, ``timestamp``, ``services_checked`` (number of
        inspected services), ``issues`` (one entry per service that is not
        healthy), ``stale_dumps`` (dumps flagged with ``warn``) and
        ``overall_healthy`` (True only if both lists are empty).
    """
    checked = {}
    problems = []

    for name, entry in all_services.items():
        level = entry.get("tier", 3)
        if level > 2:
            # tier 3 is left out of the summary
            continue

        state = docker_inspect(entry["container_name"])
        ok = is_healthy(state)
        checked[name] = {
            "tier": level,
            "healthy": ok,
            "status": state["status"],
            "health": state["health"],
        }
        if not ok:
            problems.append({"service": name, "tier": level, **state})

    # Dump checks for every service that declares a dump_file
    dump_dir = meta.get("dump_base", "/mnt/user/backups/borg/dumps/latest")
    dump_infos = (
        (name, get_dump_info(entry.get("dump_file"), dump_dir))
        for name, entry in all_services.items()
    )
    outdated = [
        {"service": name, "file": info["file"], "age_hours": info["age_hours"]}
        for name, info in dump_infos
        if info and info.get("warn")
    ]

    return {
        "mode": "summary",
        "timestamp": datetime.now().isoformat(),
        "services_checked": len(checked),
        "issues": problems,
        "stale_dumps": outdated,
        "overall_healthy": not problems and not outdated,
    }
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
|
|
|
|
def main():
    """Command-line entry point; prints one JSON document to stdout.

    Modes, selected from ``sys.argv``:
      * ``--summary`` anywhere in the arguments: overall summary report.
      * first argument not starting with ``--``: report for that service
        key; an unknown key prints an error object and exits with status 1.
      * otherwise: one report per container returned by
        ``get_unhealthy_containers()``, or an ``all_healthy`` object when
        that list is empty.  Containers without a matching entry in
        services.yaml get a placeholder report.
    """
    cli_args = sys.argv[1:]
    all_services, meta = load_services()

    if "--summary" in cli_args:
        summary = build_summary_report(all_services, meta)
        print(json.dumps(summary, indent=2, ensure_ascii=False))
        return

    # Explicit service key given as argument
    if cli_args and not cli_args[0].startswith("--"):
        requested = cli_args[0]
        entry = all_services.get(requested)
        if not entry:
            print(json.dumps({"error": f"Service '{requested}' nicht in services.yaml gefunden."}))
            sys.exit(1)
        single = build_service_report(requested, entry, all_services, meta)
        print(json.dumps(single, indent=2, ensure_ascii=False))
        return

    # No argument: discover all unhealthy containers automatically
    failing = get_unhealthy_containers()
    if not failing:
        print(json.dumps({"status": "all_healthy", "timestamp": datetime.now().isoformat()}))
        return

    reports = []
    for container_name in failing:
        # Map the container name to its service key (first match wins)
        hit = next(
            (
                (key, svc)
                for key, svc in all_services.items()
                if svc["container_name"] == container_name
            ),
            None,
        )
        service_key, service = hit if hit else (None, None)

        if service:
            reports.append(build_service_report(service_key, service, all_services, meta))
        else:
            reports.append({
                "service": container_name,
                "description": "Unbekannter Container (nicht in services.yaml)",
                "tier": None,
                "container": {"name": container_name, "status": "unhealthy", "health": "unknown", "healthy": False},
                "dependencies": {},
                "unhealthy_deps": [],
                "dump": None,
                "first_check": "Container nicht in services.yaml — manuell pruefen",
                "notes": "services.yaml aktualisieren wenn dieser Container produktiv ist",
                "timestamp": datetime.now().isoformat(),
            })

    print(json.dumps(reports, indent=2, ensure_ascii=False))
|
|
|
|
|
|
# Run the CLI only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|