Files
homelab-infra/ops/hermes-agent/scripts/check_health.py
Micha 5cc0a4dadb update
update
2026-05-06 20:18:25 +02:00

290 lines
9.6 KiB
Python

#!/usr/bin/env python3
"""
check_health.py — Homelab Alert Enricher
=========================================
Laedt services.json, prueft Services via HTTP(S) und gibt einen
strukturierten JSON-Report aus. Hermes nutzt diesen Report fuer
angereicherte ntfy-Alerts.
Keine externen Abhaengigkeiten — nur Python-Standardbibliothek.
Kein Docker CLI, kein Root, kein pip.
Check-Strategie:
- Services WITH url: HTTP GET; any status < 500 (2xx/3xx/4xx) = healthy, timeout/connection error/5xx = unhealthy
- Services OHNE url: "internal" — kein externer Check moeglich
- Dump-Timestamps: werden gelesen falls /mnt/user/... erreichbar ist (optional)
Verwendung:
python3 check_health.py # alle Services pruefen (Tier 1+2)
python3 check_health.py paperless-ngx # gezielt einen Service pruefen
python3 check_health.py --summary # Gesamtstatus Tier 1+2
python3 check_health.py --all # alle Tiers inkl. Tier 3
Pfad auf der Hermes-VM:
/srv/hermes-workspace/homelab-infra/ops/hermes-agent/scripts/check_health.py
"""
import json
import ssl
import sys
import urllib.request
import urllib.error
from datetime import datetime
from pathlib import Path
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# services.json is looked up one directory above this script first; the
# absolute path below is used as the fallback when that file does not exist.
SCRIPT_DIR = Path(__file__).parent.resolve()
SERVICES_JSON_PATH = SCRIPT_DIR.parent / "services.json"
SERVICES_JSON_FALLBACK = Path("/srv/hermes-workspace/homelab-infra/ops/hermes-agent/services.json")
# HTTP check timeout in seconds
HTTP_TIMEOUT = 8
# Dump directories (optional — dump checks are skipped when none of them exists)
DUMP_BASE_PATHS = [
    Path("/mnt/user/backups/borg/dumps/latest"),  # Unraid directly
    Path("/opt/dumps"),  # mounted fallback
]
# Dump warning threshold in hours: a dump older than this is flagged with warn=True
DUMP_WARN_HOURS = 26
# SSL verification (True = strict, False = ignore self-signed certificates)
# NOTE(review): False turns off certificate AND hostname verification for every HTTPS check.
SSL_VERIFY = False
# ---------------------------------------------------------------------------
# Hilfsfunktionen
# ---------------------------------------------------------------------------
def load_services():
    """Read services.json using only the standard library.

    The file next to the scripts directory (SERVICES_JSON_PATH) is preferred;
    SERVICES_JSON_FALLBACK is used when it does not exist.

    Returns:
        A tuple ``(services, meta)`` holding the "services" and "meta"
        entries of the JSON document; each defaults to an empty dict when
        the key is absent.

    Raises:
        FileNotFoundError: If neither location contains services.json.
    """
    path = SERVICES_JSON_FALLBACK
    if SERVICES_JSON_PATH.exists():
        path = SERVICES_JSON_PATH
    elif not path.exists():
        raise FileNotFoundError(
            f"services.json nicht gefunden: {path}\n"
            "Bitte 'git pull' in /srv/hermes-workspace/homelab-infra/ ausfuehren."
        )

    document = json.loads(path.read_text(encoding="utf-8"))
    services = document.get("services", {})
    meta = document.get("meta", {})
    return services, meta
def http_check(url: str) -> dict:
    """
    Perform an HTTP(S) GET against ``url`` and classify the outcome.

    Every response with a status code below 500 counts as reachable: a 4xx
    answer (e.g. 401/403/404) still proves that the service is up and
    responding. 5xx responses, timeouts, connection errors and URLs that are
    not http(s) count as unreachable.

    Args:
        url: Absolute ``http://`` or ``https://`` URL of the service.

    Returns:
        A dict ``{'reachable': bool, 'status_code': int | None,
        'error': str | None}``. ``status_code`` is None when no HTTP response
        was received; ``error`` is None whenever a response was received.
    """
    # urllib would happily open file:// or ftp:// URLs as well; a health check
    # must only ever speak HTTP(S), so anything else is rejected up front.
    if not isinstance(url, str) or not url.lower().startswith(("http://", "https://")):
        return {
            "reachable": False,
            "status_code": None,
            "error": f"Ungueltige URL (nur http/https erlaubt): {url!r}",
        }

    ctx = ssl.create_default_context()
    if not SSL_VERIFY:
        # check_hostname has to be switched off before verify_mode may be CERT_NONE
        ctx.check_hostname = False
        ctx.verify_mode = ssl.CERT_NONE

    try:
        req = urllib.request.Request(url, method="GET", headers={"User-Agent": "hermes-healthcheck/1.0"})
        with urllib.request.urlopen(req, timeout=HTTP_TIMEOUT, context=ctx) as resp:
            code = resp.status
            healthy = code < 500
            return {"reachable": healthy, "status_code": code, "error": None}
    except urllib.error.HTTPError as e:
        # urlopen raises for 4xx/5xx; 4xx means the service is running
        # (auth required, not found, ...) and is therefore still reachable
        healthy = e.code < 500
        return {"reachable": healthy, "status_code": e.code, "error": None}
    except urllib.error.URLError as e:
        # No HTTP response at all (DNS failure, connection refused, TLS error, ...)
        return {"reachable": False, "status_code": None, "error": str(e.reason)}
    except Exception as e:
        # Deliberately broad: a single misbehaving endpoint must never abort the report
        return {"reachable": False, "status_code": None, "error": str(e)}
def check_service(service: dict) -> dict:
    """
    Check a single service entry.

    A service with a non-empty "url" is probed via http_check(); a service
    without one cannot be checked from outside and is reported as "internal".

    Args:
        service: One service entry; only its "url" key is read here.

    Returns:
        A dict with the keys ``healthy`` (True/False, or None when the state
        is unknown), ``method`` ("http" or "internal"), ``url``,
        ``status_code`` and ``error``.
    """
    url = service.get("url")

    # Guard clause: no URL means there is nothing to probe from outside
    if not url:
        return {
            "healthy": None,  # None = unknown (internal, no external check)
            "method": "internal",
            "url": None,
            "status_code": None,
            "error": "Kein externer Check — interner Service ohne URL",
        }

    probe = http_check(url)
    return {
        "healthy": probe["reachable"],
        "method": "http",
        "url": url,
        "status_code": probe["status_code"],
        "error": probe["error"],
    }
def find_dump_base() -> Path | None:
    """Return the first entry of DUMP_BASE_PATHS that exists, or None if none does."""
    existing = (candidate for candidate in DUMP_BASE_PATHS if candidate.exists())
    return next(existing, None)
def get_dump_info(dump_file: str | None, dump_base: Path | None) -> dict | None:
"""Liest Alter und Groesse einer Dump-Datei."""
if not dump_file or not dump_base:
return None
path = dump_base / dump_file
if not path.exists():
return {"file": dump_file, "exists": False, "age_hours": None, "size_mb": None, "warn": False}
stat = path.stat()
age_hours = round((datetime.now().timestamp() - stat.st_mtime) / 3600, 1)
size_mb = round(stat.st_size / 1_048_576, 1)
return {
"file": dump_file,
"exists": True,
"age_hours": age_hours,
"size_mb": size_mb,
"warn": age_hours > DUMP_WARN_HOURS,
}
# ---------------------------------------------------------------------------
# Report-Generierung
# ---------------------------------------------------------------------------
def build_service_report(service_key: str, service: dict, all_services: dict) -> dict:
    """Build the full report for one service, including its dependencies.

    Args:
        service_key: Key of the service in services.json.
        service: The service entry belonging to ``service_key``.
        all_services: All service entries, used to resolve dependency keys.

    Returns:
        A dict with the service's own status, the status of each dependency,
        the list of dependencies that are definitely unhealthy, dump
        information and a few descriptive fields copied from the entry.
    """
    dump_root = find_dump_base()

    # Own status
    own_status = check_service(service)

    # Dependency status, keyed by dependency name
    dependency_status = {}
    for name in service.get("dependencies", []):
        entry = all_services.get(name)
        if entry:
            dependency_status[name] = {
                "tier": entry.get("tier"),
                "container_name": entry.get("container_name"),
                **check_service(entry),
            }
        else:
            dependency_status[name] = {
                "healthy": None,
                "method": "unknown",
                "error": "Nicht in services.json",
            }

    # Only dependencies that are definitely False count as unhealthy
    # (None = internal/unknown and is ignored)
    failing = [
        name
        for name, state in dependency_status.items()
        if state.get("healthy") is False
    ]

    dump_details = get_dump_info(service.get("dump_file"), dump_root)

    return {
        "service": service_key,
        "description": service.get("description", ""),
        "tier": service.get("tier"),
        "url": service.get("url"),
        "status": own_status,
        "dependencies": dependency_status,
        "unhealthy_deps": failing,
        "dump": dump_details,
        "first_check": service.get("first_check", ""),
        "notes": service.get("notes", ""),
        "timestamp": datetime.now().isoformat(),
    }
def build_summary_report(all_services: dict, include_tier3: bool = False) -> dict:
    """Check all tier 1 and tier 2 services (optionally tier 3 as well).

    Args:
        all_services: All service entries keyed by service name.
        include_tier3: When True, services with a tier above 2 are checked too.
            Entries without a "tier" key are treated as tier 3.

    Returns:
        A summary dict with the number of checked services, the list of
        failing services, the list of stale dumps and an overall flag that is
        True when no service check failed.
    """
    dump_root = find_dump_base()
    problems = []
    checked = {}

    for name, entry in all_services.items():
        tier = entry.get("tier", 3)
        in_scope = include_tier3 or not tier > 2
        if not in_scope:
            continue

        status = check_service(entry)
        is_healthy = status.get("healthy")
        checked[name] = {
            "tier": tier,
            "method": status["method"],
            "healthy": is_healthy,
            "status_code": status.get("status_code"),
            "error": status.get("error"),
        }

        # Only definite failures are issues (None = internal, not checkable)
        if is_healthy is False:
            problems.append({
                "service": name,
                "tier": tier,
                "url": entry.get("url"),
                **status,
            })

    # Dump checks run over every service, independent of the tier filter above
    dump_infos = (
        (name, get_dump_info(entry.get("dump_file"), dump_root))
        for name, entry in all_services.items()
    )
    outdated = [
        {"service": name, "file": info["file"], "age_hours": info["age_hours"]}
        for name, info in dump_infos
        if info and info.get("warn")
    ]

    has_internal = any(item["method"] == "internal" for item in checked.values())
    if has_internal:
        note = "Interne Services ohne URL konnten nicht geprueft werden."
    else:
        note = ""

    return {
        "mode": "summary",
        "timestamp": datetime.now().isoformat(),
        "dump_base_found": dump_root is not None,
        "services_checked": len(checked),
        "issues": problems,
        "stale_dumps": outdated,
        "overall_healthy": len(problems) == 0,
        "note": note,
    }
# ---------------------------------------------------------------------------
# Einstiegspunkt
# ---------------------------------------------------------------------------
def main():
    """Command-line entry point: print a JSON report to stdout.

    Without arguments, with ``--summary``, or when the first argument starts
    with ``--``, a summary report is printed (``--all`` includes tier 3).
    Otherwise the first argument is taken as a service key and a report for
    that single service is printed; an unknown key prints a JSON error object
    and exits with status 1.
    """
    cli_args = sys.argv[1:]
    services, _meta = load_services()

    wants_summary = (
        "--summary" in cli_args
        or not cli_args
        or cli_args[0].startswith("--")
    )

    if wants_summary:
        report = build_summary_report(services, include_tier3="--all" in cli_args)
    else:
        # Targeted check of one service key
        service_key = cli_args[0]
        entry = services.get(service_key)
        if not entry:
            print(json.dumps({"error": f"Service '{service_key}' nicht in services.json gefunden."}))
            sys.exit(1)
        report = build_service_report(service_key, entry, services)

    print(json.dumps(report, indent=2, ensure_ascii=False))


if __name__ == "__main__":
    main()