hermes update
hermes update
This commit is contained in:
@@ -0,0 +1,312 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
check_health.py — Homelab Alert Enricher
=========================================
Loads services.yaml, checks the Docker health of all known dependencies,
reads dump timestamps and prints a structured JSON report.

Hermes reads this report and builds an enriched ntfy message from it.

Usage:
    python3 check_health.py                  # all unhealthy containers
    python3 check_health.py paperless-ngx    # check one specific service
    python3 check_health.py --summary        # overall status as a summary

Path on the host (via Komodo clone):
    /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py

services.yaml is looked up relative to the script directory:
    ../services.yaml
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Directory that contains this script; services.yaml is expected one level up.
SCRIPT_DIR = Path(__file__).parent.resolve()
SERVICES_YAML_PATH = SCRIPT_DIR.parent.joinpath("services.yaml")

# Fallback used when the repository lives under a different path.
SERVICES_YAML_FALLBACK = Path(
    "/mnt/user/services/homelab/ops/hermes-agent/services.yaml"
)

# Dump staleness threshold in hours (older than this => warning).
DUMP_WARN_HOURS = 26
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Helper functions
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_services():
    """Load ``services.yaml`` and return ``(services, meta)``.

    The file is looked up at ``SERVICES_YAML_PATH`` first and at
    ``SERVICES_YAML_FALLBACK`` second.

    Returns:
        A 2-tuple with the values stored under the top-level ``services``
        and ``meta`` keys. An empty file, a missing key or a key with a
        null value all yield an empty dict for the affected slot.

    Raises:
        FileNotFoundError: If neither candidate path exists.
        subprocess.CalledProcessError: If PyYAML is missing and the
            on-the-fly ``pip install`` fails.
    """
    try:
        import yaml
    except ImportError:
        import importlib

        # PyYAML is not installed: install it on the fly as a minimal fallback.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "pyyaml", "-q"],
            check=True,
        )
        # The import system caches directory listings; without this the
        # freshly installed package may not be found in the running process.
        importlib.invalidate_caches()
        import yaml

    path = SERVICES_YAML_PATH if SERVICES_YAML_PATH.exists() else SERVICES_YAML_FALLBACK
    if not path.exists():
        raise FileNotFoundError(f"services.yaml nicht gefunden: {path}")

    # Read as UTF-8 explicitly instead of relying on the locale default.
    with open(path, encoding="utf-8") as f:
        # safe_load() returns None for an empty document.
        data = yaml.safe_load(f) or {}

    # "services:" / "meta:" without a value parse to None; normalise to {} so
    # callers can always use .get() / .items() on the result.
    return data.get("services") or {}, data.get("meta") or {}
|
||||
|
||||
|
||||
def docker_inspect(container_name: str) -> dict:
    """Query ``docker inspect`` for a container's state and health.

    Returns a dict with exactly two keys:

    * ``status``: running | exited | restarting | dead | not_found | error
    * ``health``: healthy | unhealthy | starting | none | unknown

    A non-zero exit code of ``docker inspect`` is reported as
    ``not_found`` / ``unknown``. If running the command raises (for example
    on timeout), ``status`` is ``error`` and ``health`` carries the
    exception text.
    """
    separator = "|||"
    command = [
        "docker",
        "inspect",
        "--format",
        "{{.State.Status}}|||{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}",
        container_name,
    ]

    try:
        proc = subprocess.run(command, capture_output=True, text=True, timeout=10)
    except Exception as exc:
        return {"status": "error", "health": str(exc)}

    if proc.returncode != 0:
        return {"status": "not_found", "health": "unknown"}

    # Output looks like "<status>|||<health>"; containers without a
    # healthcheck produce "none" via the template's else-branch.
    fields = [field.strip() for field in proc.stdout.strip().split(separator)]
    status = fields[0]
    health = fields[1] if len(fields) > 1 else "none"
    return {"status": status, "health": health}
|
||||
|
||||
|
||||
def is_healthy(inspect_result: dict) -> bool:
    """Return True when a container is running and not flagged unhealthy.

    ``inspect_result`` is a dict with optional ``status`` and ``health``
    keys (missing keys are treated as empty strings). Any status other than
    ``running`` is unhealthy; a running container is unhealthy only when its
    health is exactly ``unhealthy`` (so ``starting`` and ``none`` pass).
    """
    if inspect_result.get("status", "") != "running":
        return False
    return inspect_result.get("health", "") != "unhealthy"
|
||||
|
||||
|
||||
def get_unhealthy_containers() -> list[str]:
    """Return the sorted, de-duplicated names of problematic containers.

    A container is included when ``docker ps`` lists it for one of these
    filter groups:

    * ``health=unhealthy`` - its healthcheck reports unhealthy
    * ``status=exited`` or ``status=dead`` - it is stopped

    Best effort: if invoking docker raises (for example on timeout), an
    empty list is returned. The exit code of ``docker ps`` is not checked,
    so a failing command contributes no names.
    """
    filter_groups = (
        ("health=unhealthy",),
        # Repeating --filter with the same key matches either value, so this
        # covers dead containers as well as exited ones.
        ("status=exited", "status=dead"),
    )

    try:
        names = set()
        for group in filter_groups:
            command = ["docker", "ps"]
            for expression in group:
                command += ["--filter", expression]
            command += ["--format", "{{.Names}}"]

            proc = subprocess.run(command, capture_output=True, text=True, timeout=10)
            for line in proc.stdout.splitlines():
                name = line.strip()
                if name:
                    names.add(name)
        return sorted(names)
    except Exception:
        return []
|
||||
|
||||
|
||||
def get_dump_info(dump_file: str | None, dump_base: str) -> dict | None:
|
||||
"""Gibt Alter und Groesse des Dump-Files zurueck (oder None wenn nicht vorhanden)."""
|
||||
if not dump_file:
|
||||
return None
|
||||
|
||||
path = Path(dump_base) / dump_file
|
||||
if not path.exists():
|
||||
return {"file": dump_file, "exists": False, "age_hours": None, "size_mb": None}
|
||||
|
||||
stat = path.stat()
|
||||
age_hours = round((datetime.now().timestamp() - stat.st_mtime) / 3600, 1)
|
||||
size_mb = round(stat.st_size / 1_048_576, 1)
|
||||
|
||||
return {
|
||||
"file": dump_file,
|
||||
"exists": True,
|
||||
"age_hours": age_hours,
|
||||
"size_mb": size_mb,
|
||||
"warn": age_hours > DUMP_WARN_HOURS,
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Report generation
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_service_report(service_key: str, service: dict, all_services: dict, meta: dict) -> dict:
    """Build the full report for a single service.

    Args:
        service_key: Key of the service in ``all_services``.
        service: The service entry; must contain ``container_name``.
        all_services: All service entries, used to resolve dependencies.
        meta: Meta settings; ``dump_base`` overrides the default dump
            directory.

    Returns:
        A dict with the service's descriptive fields, its own container
        state, the state of each dependency, the list of dependency keys
        that are not healthy, the dump info and an ISO timestamp.

    Raises:
        KeyError: If ``service`` (or a resolved dependency) has no
            ``container_name``.
    """
    dump_base = meta.get("dump_base", "/mnt/user/backups/borg/dumps/latest")
    container_name = service["container_name"]

    # State of the service's own container.
    own_inspect = docker_inspect(container_name)
    own_healthy = is_healthy(own_inspect)

    # State of every declared dependency. "dependencies:" without a value
    # parses to None in YAML, so treat that like an empty list.
    dep_results = {}
    for dep_key in service.get("dependencies") or []:
        dep = all_services.get(dep_key)
        if not dep:
            # Dependency is referenced but not defined in services.yaml.
            dep_results[dep_key] = {
                "status": "unknown_service",
                "health": "unknown",
                "healthy": False,
            }
            continue

        insp = docker_inspect(dep["container_name"])
        dep_results[dep_key] = {
            **insp,
            "healthy": is_healthy(insp),
            "tier": dep.get("tier"),
            "container_name": dep["container_name"],
        }

    unhealthy_deps = [key for key, result in dep_results.items() if not result["healthy"]]

    dump_info = get_dump_info(service.get("dump_file"), dump_base)

    return {
        "service": service_key,
        "description": service.get("description", ""),
        "tier": service.get("tier"),
        "url": service.get("url"),
        "container": {
            "name": container_name,
            **own_inspect,
            "healthy": own_healthy,
        },
        "dependencies": dep_results,
        "unhealthy_deps": unhealthy_deps,
        "dump": dump_info,
        "first_check": service.get("first_check", ""),
        "notes": service.get("notes", ""),
        "timestamp": datetime.now().isoformat(),
    }
|
||||
|
||||
|
||||
def build_summary_report(all_services: dict, meta: dict) -> dict:
    """Check all tier-1 and tier-2 services and return an overall status.

    Services whose ``tier`` is greater than 2 (or missing, which defaults
    to 3) are skipped for the container check. Dump files are checked for
    every service regardless of tier.

    Returns:
        A dict with ``mode`` ("summary"), ``timestamp``,
        ``services_checked``, ``issues`` (one entry per unhealthy service),
        ``stale_dumps`` (one entry per dump flagged ``warn``) and
        ``overall_healthy`` (True only when both lists are empty).
    """
    checked = 0
    problems = []
    for name, entry in all_services.items():
        level = entry.get("tier", 3)
        if level > 2:
            # Tier-3 services are not part of the summary.
            continue

        state = docker_inspect(entry["container_name"])
        checked += 1
        if not is_healthy(state):
            problems.append({"service": name, "tier": level, **state})

    # Dump checks cover every service that declares a dump_file.
    dump_root = meta.get("dump_base", "/mnt/user/backups/borg/dumps/latest")
    dump_infos = (
        (name, get_dump_info(entry.get("dump_file"), dump_root))
        for name, entry in all_services.items()
    )
    outdated = [
        {"service": name, "file": info["file"], "age_hours": info["age_hours"]}
        for name, info in dump_infos
        if info and info.get("warn")
    ]

    return {
        "mode": "summary",
        "timestamp": datetime.now().isoformat(),
        "services_checked": checked,
        "issues": problems,
        "stale_dumps": outdated,
        "overall_healthy": not problems and not outdated,
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """Command-line entry point; prints a JSON report to stdout.

    Modes, selected from ``sys.argv``:

    * ``--summary`` anywhere in the arguments: overall summary report.
    * First argument not starting with ``--``: report for that service key;
      prints an error object and exits with status 1 if the key is unknown.
    * Otherwise: one report per container returned by
      ``get_unhealthy_containers()``, or an ``all_healthy`` object when that
      list is empty.
    """
    args = sys.argv[1:]
    all_services, meta = load_services()

    if "--summary" in args:
        report = build_summary_report(all_services, meta)
        print(json.dumps(report, indent=2, ensure_ascii=False))
        return

    # Explicit service key given as the first argument.
    if args and not args[0].startswith("--"):
        service_key = args[0]
        service = all_services.get(service_key)
        if not service:
            print(json.dumps({"error": f"Service '{service_key}' nicht in services.yaml gefunden."}))
            sys.exit(1)
        report = build_service_report(service_key, service, all_services, meta)
        print(json.dumps(report, indent=2, ensure_ascii=False))
        return

    # No argument: discover all unhealthy containers automatically.
    unhealthy_names = get_unhealthy_containers()
    if not unhealthy_names:
        print(json.dumps({"status": "all_healthy", "timestamp": datetime.now().isoformat()}))
        return

    # Map container name -> (service key, service) once. setdefault keeps the
    # first service that claims a name; entries without container_name are
    # skipped instead of raising during the lookup.
    by_container = {}
    for key, svc in all_services.items():
        name = svc.get("container_name")
        if name is not None:
            by_container.setdefault(name, (key, svc))

    reports = []
    for container_name in unhealthy_names:
        match = by_container.get(container_name)
        if match is None:
            # Container is not described in services.yaml: report its real
            # docker state rather than a hard-coded placeholder.
            insp = docker_inspect(container_name)
            reports.append({
                "service": container_name,
                "description": "Unbekannter Container (nicht in services.yaml)",
                "tier": None,
                "container": {
                    "name": container_name,
                    **insp,
                    "healthy": is_healthy(insp),
                },
                "dependencies": {},
                "unhealthy_deps": [],
                "dump": None,
                "first_check": "Container nicht in services.yaml — manuell pruefen",
                "notes": "services.yaml aktualisieren wenn dieser Container produktiv ist",
                "timestamp": datetime.now().isoformat(),
            })
            continue

        service_key, service = match
        reports.append(build_service_report(service_key, service, all_services, meta))

    print(json.dumps(reports, indent=2, ensure_ascii=False))
|
||||
|
||||
|
||||
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user