hermes update
hermes update
This commit is contained in:
@@ -0,0 +1,312 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
check_health.py — Homelab Alert Enricher
|
||||
=========================================
|
||||
Laedt services.yaml, prueft Docker-Health aller bekannten Abhaengigkeiten,
|
||||
liest Dump-Timestamps und gibt einen strukturierten JSON-Report aus.
|
||||
|
||||
Hermes liest diesen Report und baut daraus eine angereicherte ntfy-Nachricht.
|
||||
|
||||
Verwendung:
|
||||
python3 check_health.py # alle unhealthy Container
|
||||
python3 check_health.py paperless-ngx # gezielt einen Service pruefen
|
||||
python3 check_health.py --summary # Gesamtstatus als Zusammenfassung
|
||||
|
||||
Pfad auf Host (via Komodo-Clone):
|
||||
/mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py
|
||||
|
||||
services.yaml wird relativ zum Script-Verzeichnis gesucht:
|
||||
../services.yaml
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Konfiguration
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Directory that contains this script; services.yaml is expected one level above it.
SCRIPT_DIR = Path(__file__).parent.resolve()
SERVICES_YAML_PATH = SCRIPT_DIR.parent / "services.yaml"

# Fallback in case the repo is checked out under a different path
SERVICES_YAML_FALLBACK = Path("/mnt/user/services/homelab/ops/hermes-agent/services.yaml")

# Dump warning threshold in hours (older = warning)
DUMP_WARN_HOURS = 26
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Hilfsfunktionen
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def load_services():
    """Load services.yaml and return ``(services_dict, meta_dict)``.

    The file is read from ``SERVICES_YAML_PATH`` if that exists, otherwise
    from ``SERVICES_YAML_FALLBACK``.

    Returns:
        A tuple ``(services, meta)``. Each element is an empty dict when the
        corresponding top-level key is missing or null, or when the file
        contains no YAML document at all.

    Raises:
        FileNotFoundError: if neither candidate path exists.
        subprocess.CalledProcessError: if PyYAML is missing and the
            ``pip install`` fallback fails.
    """
    try:
        import yaml
    except ImportError:
        # PyYAML not installed: minimal fallback via pip.
        # NOTE(review): installing packages at runtime mutates the host
        # environment; consider provisioning PyYAML ahead of time instead.
        subprocess.run(
            [sys.executable, "-m", "pip", "install", "pyyaml", "-q"],
            check=True,
        )
        import yaml

    path = SERVICES_YAML_PATH if SERVICES_YAML_PATH.exists() else SERVICES_YAML_FALLBACK
    if not path.exists():
        raise FileNotFoundError(f"services.yaml nicht gefunden: {path}")

    # Explicit UTF-8 so parsing does not depend on the host locale.
    with open(path, encoding="utf-8") as f:
        # safe_load() returns None for an empty file; normalise to a dict.
        data = yaml.safe_load(f) or {}

    # `or {}` also covers keys that are present but null (e.g. a bare "meta:").
    return data.get("services") or {}, data.get("meta") or {}
|
||||
|
||||
|
||||
def docker_inspect(container_name: str) -> dict:
    """Query Docker for a container's run state and health-check state.

    Args:
        container_name: exact Docker container name passed to ``docker inspect``.

    Returns:
        ``{'status': str, 'health': str}`` where
        status: running | exited | restarting | dead | not_found | error | unknown
        health: healthy | unhealthy | starting | none | unknown
        When status is ``error``, ``health`` carries the error text (Docker's
        stderr or the exception message) if one is available.
    """
    try:
        result = subprocess.run(
            [
                "docker", "inspect",
                "--format",
                "{{.State.Status}}|||{{if .State.Health}}{{.State.Health.Status}}{{else}}none{{end}}",
                container_name,
            ],
            capture_output=True,
            text=True,
            timeout=10,
        )
        if result.returncode != 0:
            stderr = result.stderr.strip()
            # Only a "No such object/container" message means the container is
            # missing; anything else (daemon down, permission denied, ...)
            # must not be reported as a missing container.
            if "no such" in stderr.lower():
                return {"status": "not_found", "health": "unknown"}
            return {"status": "error", "health": stderr or "unknown"}

        parts = result.stdout.strip().split("|||")
        return {
            # split() always yields at least one element, so guard against an
            # empty string rather than an empty list.
            "status": parts[0].strip() or "unknown",
            "health": parts[1].strip() if len(parts) > 1 else "none",
        }
    except Exception as e:
        # Best-effort: docker missing, timeout etc. become an error result
        # instead of aborting the whole report.
        return {"status": "error", "health": str(e)}
|
||||
|
||||
|
||||
def is_healthy(inspect_result: dict) -> bool:
    """Tell whether a ``docker_inspect`` result describes a healthy container.

    A container counts as healthy when its status is ``running`` and its
    health is anything other than ``unhealthy``; missing keys are treated as
    empty strings.
    """
    is_running = inspect_result.get("status", "") == "running"
    failed_check = inspect_result.get("health", "") == "unhealthy"
    return is_running and not failed_check
|
||||
|
||||
|
||||
def get_unhealthy_containers() -> list[str]:
    """Return the sorted names of containers that are unhealthy, exited or dead.

    Runs two ``docker ps`` queries and merges their output. The function is
    best-effort: a query that cannot be run or exits non-zero contributes no
    names, but the failure is reported on stderr (stdout is left untouched)
    so that an empty result is not silently mistaken for "everything healthy".
    """
    queries = (
        # Containers failing their health check
        ["--filter", "health=unhealthy"],
        # Exited/dead containers that should actually be running; Docker ORs
        # repeated filters that share the same key.
        ["--filter", "status=exited", "--filter", "status=dead"],
    )

    names: set[str] = set()
    for filters in queries:
        cmd = ["docker", "ps", *filters, "--format", "{{.Names}}"]
        cmd_text = " ".join(cmd)
        try:
            proc = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
        except (OSError, subprocess.SubprocessError) as exc:
            print(f"check_health: '{cmd_text}' could not be run: {exc}", file=sys.stderr)
            continue
        if proc.returncode != 0:
            print(
                f"check_health: '{cmd_text}' exited with {proc.returncode}: {proc.stderr.strip()}",
                file=sys.stderr,
            )
            continue
        names.update(line.strip() for line in proc.stdout.splitlines() if line.strip())

    return sorted(names)
|
||||
|
||||
|
||||
def get_dump_info(dump_file: str | None, dump_base: str) -> dict | None:
|
||||
"""Gibt Alter und Groesse des Dump-Files zurueck (oder None wenn nicht vorhanden)."""
|
||||
if not dump_file:
|
||||
return None
|
||||
|
||||
path = Path(dump_base) / dump_file
|
||||
if not path.exists():
|
||||
return {"file": dump_file, "exists": False, "age_hours": None, "size_mb": None}
|
||||
|
||||
stat = path.stat()
|
||||
age_hours = round((datetime.now().timestamp() - stat.st_mtime) / 3600, 1)
|
||||
size_mb = round(stat.st_size / 1_048_576, 1)
|
||||
|
||||
return {
|
||||
"file": dump_file,
|
||||
"exists": True,
|
||||
"age_hours": age_hours,
|
||||
"size_mb": size_mb,
|
||||
"warn": age_hours > DUMP_WARN_HOURS,
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Report-Generierung
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def build_service_report(service_key: str, service: dict, all_services: dict, meta: dict) -> dict:
    """Build the complete report for a single service.

    Args:
        service_key: key of the service in services.yaml.
        service: that service's entry; must contain ``container_name``.
        all_services: all service entries, used to resolve dependencies.
        meta: the ``meta`` section of services.yaml (may be empty or None).

    Returns:
        A dict with the service's descriptive fields, its own container
        state, the state of every listed dependency, the keys of unhealthy
        dependencies, dump information and a timestamp.

    Raises:
        KeyError: if ``service`` (or a resolved dependency) has no
            ``container_name``.
    """
    # `or` fallbacks: YAML keys that are present but empty parse to None.
    dump_base = (meta or {}).get("dump_base") or "/mnt/user/backups/borg/dumps/latest"

    # Own container state
    own_inspect = docker_inspect(service["container_name"])
    own_healthy = is_healthy(own_inspect)

    # Dependency check
    dep_results = {}
    # A bare "dependencies:" key yields None, which must behave like [].
    for dep_key in service.get("dependencies") or []:
        dep = all_services.get(dep_key)
        if not dep:
            # Dependency is named but has no entry of its own in services.yaml.
            dep_results[dep_key] = {"status": "unknown_service", "health": "unknown", "healthy": False}
            continue
        insp = docker_inspect(dep["container_name"])
        dep_results[dep_key] = {
            **insp,
            "healthy": is_healthy(insp),
            "tier": dep.get("tier"),
            "container_name": dep["container_name"],
        }

    unhealthy_deps = [k for k, v in dep_results.items() if not v["healthy"]]

    # Dump info
    dump_info = get_dump_info(service.get("dump_file"), dump_base)

    return {
        "service": service_key,
        "description": service.get("description", ""),
        "tier": service.get("tier"),
        "url": service.get("url"),
        "container": {
            "name": service["container_name"],
            **own_inspect,
            "healthy": own_healthy,
        },
        "dependencies": dep_results,
        "unhealthy_deps": unhealthy_deps,
        "dump": dump_info,
        "first_check": service.get("first_check", ""),
        "notes": service.get("notes", ""),
        "timestamp": datetime.now().isoformat(),
    }
|
||||
|
||||
|
||||
def build_summary_report(all_services: dict, meta: dict) -> dict:
    """Check all tier-1 and tier-2 services and return an overall status.

    Args:
        all_services: all service entries from services.yaml.
        meta: the ``meta`` section of services.yaml (may be empty or None).

    Returns:
        A dict with ``mode``, ``timestamp``, ``services_checked`` (number of
        tier-1/2 services inspected), ``issues`` (unhealthy services),
        ``stale_dumps`` (dumps of any tier that are too old or missing; a
        missing dump has ``age_hours`` None) and ``overall_healthy``.
    """
    services_checked = 0
    issues = []

    for key, svc in all_services.items():
        tier = svc.get("tier")
        if tier is None:
            # Missing or null tier is treated as lowest criticality.
            tier = 3
        if tier > 2:
            continue  # skip tier 3 in the summary

        insp = docker_inspect(svc["container_name"])
        services_checked += 1
        if not is_healthy(insp):
            issues.append({"service": key, "tier": tier, **insp})

    # Dump checks for all services that declare a dump_file
    dump_base = (meta or {}).get("dump_base") or "/mnt/user/backups/borg/dumps/latest"
    stale_dumps = []
    for key, svc in all_services.items():
        info = get_dump_info(svc.get("dump_file"), dump_base)
        if not info:
            continue
        # A dump that is configured but absent is reported as well; otherwise
        # the summary would call the lab healthy with no backup present.
        if info.get("warn") or not info.get("exists"):
            stale_dumps.append({
                "service": key,
                "file": info["file"],
                "age_hours": info["age_hours"],
            })

    return {
        "mode": "summary",
        "timestamp": datetime.now().isoformat(),
        "services_checked": services_checked,
        "issues": issues,
        "stale_dumps": stale_dumps,
        "overall_healthy": not issues and not stale_dumps,
    }
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Einstiegspunkt
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """CLI entry point: print a JSON health report on stdout.

    Modes, selected from ``sys.argv``:
        ``--summary``    overall status of tier-1/2 services and dumps
        ``<service>``    full report for one service key from services.yaml
        (no argument)    reports for every container currently found
                         unhealthy/exited, or ``{"status": "all_healthy"}``

    Exits with status 1 when services.yaml cannot be read or the requested
    service key is unknown; the error is printed as a JSON object.
    """
    args = sys.argv[1:]
    try:
        all_services, meta = load_services()
    except OSError as exc:
        # Keep stdout machine-readable even when the knowledge base is missing.
        print(json.dumps({"error": str(exc)}, ensure_ascii=False))
        sys.exit(1)

    if "--summary" in args:
        report = build_summary_report(all_services, meta)
        print(json.dumps(report, indent=2, ensure_ascii=False))
        return

    # Explicit service key as argument
    if args and not args[0].startswith("--"):
        service_key = args[0]
        service = all_services.get(service_key)
        if not service:
            print(json.dumps({"error": f"Service '{service_key}' nicht in services.yaml gefunden."}))
            sys.exit(1)
        report = build_service_report(service_key, service, all_services, meta)
        print(json.dumps(report, indent=2, ensure_ascii=False))
        return

    # No argument: find all unhealthy containers automatically
    unhealthy_names = get_unhealthy_containers()

    if not unhealthy_names:
        print(json.dumps({"status": "all_healthy", "timestamp": datetime.now().isoformat()}))
        return

    # Map container names to service keys once; setdefault keeps the first
    # service that claims a name, like a linear search with break would.
    key_by_container = {}
    for key, svc in all_services.items():
        name = svc.get("container_name")
        if name:
            key_by_container.setdefault(name, key)

    reports = []
    for container_name in unhealthy_names:
        service_key = key_by_container.get(container_name)

        if service_key is None:
            # Report the container's real state; "unhealthy" is a health
            # value, not a run state, and the container may simply be exited.
            insp = docker_inspect(container_name)
            reports.append({
                "service": container_name,
                "description": "Unbekannter Container (nicht in services.yaml)",
                "tier": None,
                "container": {"name": container_name, **insp, "healthy": is_healthy(insp)},
                "dependencies": {},
                "unhealthy_deps": [],
                "dump": None,
                "first_check": "Container nicht in services.yaml — manuell pruefen",
                "notes": "services.yaml aktualisieren wenn dieser Container produktiv ist",
                "timestamp": datetime.now().isoformat(),
            })
            continue

        reports.append(
            build_service_report(service_key, all_services[service_key], all_services, meta)
        )

    print(json.dumps(reports, indent=2, ensure_ascii=False))
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
@@ -0,0 +1,592 @@
|
||||
# services.yaml — Maschinenlesbare Wissensbasis fuer Hermes Alert Enrichment
|
||||
#
|
||||
# Abgeleitet aus docs/SERVICE_CATALOG.md
|
||||
# Stand: 2026-05-06
|
||||
#
|
||||
# Zweck: Hermes laedt diese Datei beim Alert-Anreichern, um Abhaengigkeiten,
|
||||
# Dump-Zeitstempel und den ersten Diagnoseschritt nachzuschlagen.
|
||||
#
|
||||
# Felder:
|
||||
# description - Kurzbeschreibung des Dienstes
|
||||
# tier - Kritikalitaet: 1=Control Plane, 2=User Apps, 3=Ops/Tools
|
||||
# category - core | security | infra | app | ops
|
||||
# container_name - exakter Docker-Containername (fuer docker inspect)
|
||||
# dependencies - Liste direkter Laufzeit-Abhaengigkeiten (andere Service-Keys)
|
||||
# url - oeffentliche URL (null = intern/LAN only)
|
||||
# dump_file - Dateiname in /mnt/user/backups/borg/dumps/latest/ (null = kein Dump)
|
||||
# data_paths - kritische Datenpfade auf dem Host
|
||||
# first_check - erster Diagnoseschritt bei Ausfall (Freitext fuer Hermes)
|
||||
# notes - betriebliche Hinweise und dokumentierte Ausnahmen
|
||||
|
||||
meta:
|
||||
dump_base: /mnt/user/backups/borg/dumps/latest
|
||||
appdata_base: /mnt/user/appdata
|
||||
secrets_path: /mnt/user/appdata/secrets
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TIER 1 — Control Plane (Ausfall blockiert alles darunter)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
services:
|
||||
|
||||
traefik:
|
||||
description: Zentraler Reverse Proxy, TLS, Docker-Label-Routing
|
||||
tier: 1
|
||||
category: core
|
||||
container_name: traefik
|
||||
dependencies: []
|
||||
url: https://traefik.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/traefik/dynamic
|
||||
- /mnt/user/appdata/traefik/letsencrypt
|
||||
first_check: "Host-Ports 80/443 erreichbar? dynamic/ korrekt auf Host synchronisiert?"
|
||||
notes: "dynamic configs werden NICHT automatisch von Komodo deployed — manueller Host-Sync noetig"
|
||||
|
||||
adguard:
|
||||
description: DNS-Server / LAN DNS
|
||||
tier: 1
|
||||
category: core
|
||||
container_name: adguard
|
||||
dependencies:
|
||||
- unbound
|
||||
url: null
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/adguard/conf
|
||||
- /mnt/user/appdata/adguard/work
|
||||
first_check: "Port 53 erreichbar? Unbound healthy? dns_net Konnektivitaet?"
|
||||
notes: "Ports 53 und 8082 dokumentierte Host-Port-Ausnahmen"
|
||||
|
||||
unbound:
|
||||
description: Upstream DNS Resolver fuer AdGuard
|
||||
tier: 1
|
||||
category: core
|
||||
container_name: unbound
|
||||
dependencies: []
|
||||
url: null
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/unbound/config
|
||||
first_check: "dns_net Konnektivitaet pruefen; Container-Logs auf Fehler pruefen"
|
||||
notes: "rebuildbar; isoliert in dns_net"
|
||||
|
||||
tailscale:
|
||||
description: VPN / Remote-Zugang
|
||||
tier: 1
|
||||
category: core
|
||||
container_name: tailscale
|
||||
dependencies: []
|
||||
url: null
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/tailscale
|
||||
first_check: "Tailscale Status auf Host pruefen; State-Datei fuer Key-Renewal vorhanden?"
|
||||
notes: "network_mode: host; NET_ADMIN, NET_RAW, /dev/net/tun — dokumentierte VPN-Ausnahmen"
|
||||
|
||||
gitea:
|
||||
description: Git-Server — operative Quelle der Wahrheit fuer GitOps
|
||||
tier: 1
|
||||
category: core
|
||||
container_name: gitea
|
||||
dependencies:
|
||||
- traefik
|
||||
url: https://git.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/services/gitea/data
|
||||
first_check: "HTTPS erreichbar? SQLite in /data intakt? SSH-Port 222 erreichbar?"
|
||||
notes: "SQLite in /data — kein separater Dump; ohne externen Mirror im DR kritisch"
|
||||
|
||||
authelia:
|
||||
description: ForwardAuth — zentrale Authentifizierung fuer Admin-UIs
|
||||
tier: 1
|
||||
category: security
|
||||
container_name: authelia
|
||||
dependencies:
|
||||
- postgresql17
|
||||
- traefik
|
||||
url: https://auth.kaleschke.info
|
||||
dump_file: postgresql17-authelia.dump
|
||||
data_paths:
|
||||
- /mnt/user/appdata/authelia/config
|
||||
first_check: "PostgreSQL healthy? SMTP via GMX erreichbar? Host-Config aktuell (Repo-Baseline != Host)?"
|
||||
notes: "kein Redis-Session-Backend; SMTP-Notifier GMX; Repo-Baseline muss manuell in Host-Config gemerged werden"
|
||||
|
||||
vaultwarden:
|
||||
description: Passwort-Tresor
|
||||
tier: 1
|
||||
category: security
|
||||
container_name: vaultwarden
|
||||
dependencies:
|
||||
- traefik
|
||||
url: https://vault.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/vaultwarden
|
||||
first_check: "HTTPS erreichbar? Appdata-Volume intakt?"
|
||||
notes: "ADMIN_TOKEN_FILE; keine direkten Host-Ports"
|
||||
|
||||
postgresql17:
|
||||
description: Shared PostgreSQL Cluster (Authelia, Paperless, Mail-Archiver, Mealie, Komodo indirekt)
|
||||
tier: 1
|
||||
category: infra
|
||||
container_name: postgresql17
|
||||
dependencies: []
|
||||
url: null
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/postgresql17
|
||||
first_check: "backend_net Konnektivitaet? Disk-Space auf /mnt/user/appdata? pg_isready im Container?"
|
||||
notes: "Dumps per Dienst unter dumps/latest; raw DB nicht primaerer Restore-Weg"
|
||||
|
||||
komodo-core:
|
||||
description: GitOps UI / API / Stack-Manager
|
||||
tier: 1
|
||||
category: ops
|
||||
container_name: komodo-core
|
||||
dependencies:
|
||||
- komodo-mongo
|
||||
- gitea
|
||||
- traefik
|
||||
url: https://komodo.kaleschke.info
|
||||
dump_file: komodo-mongo.archive.gz
|
||||
data_paths:
|
||||
- /mnt/user/appdata/komodo/core
|
||||
first_check: "MongoDB healthy? Gitea erreichbar? komodo_net Konnektivitaet?"
|
||||
notes: "keine pauschale Authelia-ForwardAuth; Gitea DNS override konfiguriert"
|
||||
|
||||
komodo-mongo:
|
||||
description: Komodo Datenbank (MongoDB)
|
||||
tier: 1
|
||||
category: infra
|
||||
container_name: komodo-mongo
|
||||
dependencies: []
|
||||
url: null
|
||||
dump_file: komodo-mongo.archive.gz
|
||||
data_paths:
|
||||
- /mnt/user/appdata/komodo/mongo
|
||||
first_check: "komodo_net Konnektivitaet? Disk-Space? mongosh ping?"
|
||||
notes: "Dump-Integritaet nach Major-Upgrades pruefen"
|
||||
|
||||
komodo-periphery:
|
||||
description: Komodo Host-Agent (Stack-Deployments)
|
||||
tier: 1
|
||||
category: ops
|
||||
container_name: komodo-periphery
|
||||
dependencies:
|
||||
- komodo-core
|
||||
url: null
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/komodo/periphery
|
||||
first_check: "Docker-Socket lesbar? /mnt/user/services gemountet? komodo_net Verbindung zu Core?"
|
||||
notes: "Docker-Socket-Ausnahme dokumentiert; /mnt/user/services Mount fuer Stack-Workspaces"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TIER 2 — User Apps
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
redis:
|
||||
description: Shared Redis Cache (Paperless, weitere)
|
||||
tier: 2
|
||||
category: infra
|
||||
container_name: redis
|
||||
dependencies: []
|
||||
url: null
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/redis
|
||||
first_check: "backend_net Konnektivitaet? redis-cli ping erreichbar?"
|
||||
notes: "transiente Daten; bewusst nicht Backup-kritisch"
|
||||
|
||||
paperless-ngx:
|
||||
description: Dokumentenmanagement
|
||||
tier: 2
|
||||
category: app
|
||||
container_name: paperless-ngx
|
||||
dependencies:
|
||||
- postgresql17
|
||||
- redis
|
||||
- traefik
|
||||
url: https://paperless.kaleschke.info
|
||||
dump_file: postgresql17-paperless.dump
|
||||
data_paths:
|
||||
- /mnt/user/appdata/paperless-ngx/data
|
||||
- /mnt/user/documents/paperless
|
||||
- /mnt/user/documents/scans_inbox
|
||||
first_check: "Redis healthy? PostgreSQL healthy? backend_net Konnektivitaet?"
|
||||
notes: "DB/Redis Secrets als Stack ENV (keine _FILE Variante)"
|
||||
|
||||
paperless-gpt:
|
||||
description: KI-Ergaenzung fuer Paperless (OCR/Tagging via LLM)
|
||||
tier: 2
|
||||
category: app
|
||||
container_name: paperless-gpt
|
||||
dependencies:
|
||||
- paperless-ngx
|
||||
- traefik
|
||||
url: https://paperless-gpt.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/paperless-gpt/data
|
||||
- /mnt/user/appdata/paperless-gpt/prompts
|
||||
first_check: "Paperless API erreichbar? LLM/Ollama erreichbar? API Token gesetzt?"
|
||||
notes: "API Token als Stack ENV; abhaengig von laufendem Paperless"
|
||||
|
||||
immich_server:
|
||||
description: Foto-/Video-App
|
||||
tier: 2
|
||||
category: app
|
||||
container_name: immich_server
|
||||
dependencies:
|
||||
- immich_postgres
|
||||
- immich_redis
|
||||
- immich_machine_learning
|
||||
- traefik
|
||||
url: https://immich.kaleschke.info
|
||||
dump_file: immich.dump
|
||||
data_paths:
|
||||
- /mnt/user/photos/immich
|
||||
- /mnt/user/photos/family_archive
|
||||
first_check: "immich_postgres healthy? immich_redis healthy? ML-Container healthy? immich_default Netz?"
|
||||
notes: "native App-Auth; externes Fotoarchiv gemountet"
|
||||
|
||||
immich_postgres:
|
||||
description: Immich-Datenbank
|
||||
tier: 2
|
||||
category: infra
|
||||
container_name: immich_postgres
|
||||
dependencies: []
|
||||
url: null
|
||||
dump_file: immich.dump
|
||||
data_paths:
|
||||
- /mnt/user/appdata/immich_postgres
|
||||
first_check: "immich_default Netz? Disk-Space? pg_isready?"
|
||||
notes: "nie ins frontend_net; immich_default Netz isoliert"
|
||||
|
||||
immich_redis:
|
||||
description: Immich Cache
|
||||
tier: 2
|
||||
category: infra
|
||||
container_name: immich_redis
|
||||
dependencies: []
|
||||
url: null
|
||||
dump_file: null
|
||||
data_paths: []
|
||||
first_check: "immich_default Netz? redis-cli ping?"
|
||||
notes: "rebuildbar; anonymes Volume — named volume als offenes TODO"
|
||||
|
||||
immich_machine_learning:
|
||||
description: Immich ML (Gesichtserkennung, Suche)
|
||||
tier: 2
|
||||
category: infra
|
||||
container_name: immich_machine_learning
|
||||
dependencies: []
|
||||
url: null
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- model-cache
|
||||
first_check: "immich_default Netz? model-cache Volume vorhanden?"
|
||||
notes: "rebuildbar; intern-only"
|
||||
|
||||
mealie:
|
||||
description: Rezeptverwaltung
|
||||
tier: 2
|
||||
category: app
|
||||
container_name: mealie
|
||||
dependencies:
|
||||
- mealie-postgres
|
||||
- traefik
|
||||
url: https://mealie.kaleschke.info
|
||||
dump_file: mealie.dump
|
||||
data_paths:
|
||||
- /mnt/user/appdata/mealie/data
|
||||
first_check: "mealie-postgres healthy? mealie_internal Netz erreichbar?"
|
||||
notes: "App + DB in internem Netz getrennt (mealie_internal)"
|
||||
|
||||
mealie-postgres:
|
||||
description: Mealie-Datenbank
|
||||
tier: 2
|
||||
category: infra
|
||||
container_name: mealie-postgres
|
||||
dependencies: []
|
||||
url: null
|
||||
dump_file: mealie.dump
|
||||
data_paths:
|
||||
- /mnt/user/appdata/mealie/postgres
|
||||
first_check: "mealie_internal Netz? Disk-Space?"
|
||||
notes: "interne DB; mealie_internal Netz"
|
||||
|
||||
mail-archiver:
|
||||
description: Mail-Archivierung (IMAP)
|
||||
tier: 2
|
||||
category: app
|
||||
container_name: mail-archiver
|
||||
dependencies:
|
||||
- postgresql17
|
||||
- authelia
|
||||
- traefik
|
||||
url: https://mail.kaleschke.info
|
||||
dump_file: postgresql17-mailarchiver.dump
|
||||
data_paths:
|
||||
- /mnt/user/appdata/mailarchiver/data-protection-keys
|
||||
first_check: "PostgreSQL healthy? Internet-/IMAP-Zugang? Authelia healthy?"
|
||||
notes: "Hybrid: frontend_net fuer IMAP/Internet, backend_net fuer DB"
|
||||
|
||||
nextcloud:
|
||||
description: Datei-/Cloud-Dienst
|
||||
tier: 2
|
||||
category: app
|
||||
container_name: nextcloud
|
||||
dependencies:
|
||||
- nextcloud-postgres
|
||||
- nextcloud-redis
|
||||
- traefik
|
||||
url: https://cloud.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/nextcloud/html
|
||||
- /mnt/user/documents/nextcloud-data
|
||||
first_check: "nextcloud-postgres healthy? nextcloud-redis healthy? nextcloud_internal Netz?"
|
||||
notes: "native App-Auth (kein zentrales ForwardAuth); WebDAV/CardDAV beachten"
|
||||
|
||||
nextcloud-postgres:
|
||||
description: Nextcloud-Datenbank
|
||||
tier: 2
|
||||
category: infra
|
||||
container_name: nextcloud-postgres
|
||||
dependencies: []
|
||||
url: null
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/nextcloud/postgres
|
||||
first_check: "nextcloud_internal Netz? Disk-Space?"
|
||||
notes: "interne DB"
|
||||
|
||||
nextcloud-redis:
|
||||
description: Nextcloud Cache / Locking
|
||||
tier: 2
|
||||
category: infra
|
||||
container_name: nextcloud-redis
|
||||
dependencies: []
|
||||
url: null
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/nextcloud/redis
|
||||
first_check: "nextcloud_internal Netz? redis-cli ping?"
|
||||
notes: "rebuildbar"
|
||||
|
||||
ntfy:
|
||||
description: Push-Benachrichtigungen (Alert-Backbone)
|
||||
tier: 2
|
||||
category: app
|
||||
container_name: ntfy
|
||||
dependencies:
|
||||
- traefik
|
||||
url: https://ntfy.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/ntfy
|
||||
first_check: "HTTPS erreichbar? NTFY_BEHIND_PROXY=true gesetzt? Traefik healthy?"
|
||||
notes: "KRITISCH: Ausfall bedeutet keine anderen Alerts ankommen; Monitoring/Borg-Benachrichtigungen"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# TIER 3 — Ops / Tools (Ausfall schmerzt, blockiert nichts Kritisches)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
homepage:
|
||||
description: Start-Dashboard
|
||||
tier: 3
|
||||
category: ops
|
||||
container_name: homepage
|
||||
dependencies:
|
||||
- traefik
|
||||
url: https://home.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/homepage
|
||||
first_check: "Traefik erreichbar? Docker-Socket read-only lesbar? API-Tokens gueltig?"
|
||||
notes: "Docker socket read-only; viele API Tokens in Config"
|
||||
|
||||
uptime-kuma:
|
||||
description: Monitoring / Uptime Checks
|
||||
tier: 3
|
||||
category: ops
|
||||
container_name: UptimeKuma
|
||||
dependencies:
|
||||
- traefik
|
||||
url: https://uptime.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/uptime-kuma
|
||||
first_check: "Datenbank-Volume intakt? Traefik erreichbar?"
|
||||
notes: "Monitore nach Restore manuell pruefen"
|
||||
|
||||
grafana:
|
||||
description: Metrik-Dashboard
|
||||
tier: 3
|
||||
category: ops
|
||||
container_name: grafana
|
||||
dependencies:
|
||||
- influxdb3-core
|
||||
- traefik
|
||||
url: https://grafana.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/grafana
|
||||
first_check: "influxdb3-core healthy? Datasource-Token in Secret gesetzt? Provisioning-Konfig vorhanden?"
|
||||
notes: "laeuft als user 0 wegen Host-Appdata-Permissions (dokumentiert); Datasource wird provisioniert"
|
||||
|
||||
influxdb3-core:
|
||||
description: Zeitreihen- / Metrikdaten fuer Grafana und Home Assistant
|
||||
tier: 3
|
||||
category: ops
|
||||
container_name: influxdb3-core
|
||||
dependencies: []
|
||||
url: null
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/influxdb3/data
|
||||
- /mnt/user/appdata/influxdb3/plugins
|
||||
first_check: "LAN-Port 8181 erreichbar? 401 ohne Token = OK (erwartet). Disk-Space?"
|
||||
notes: "LAN-only Host-Port 8181; kein frontend_net; laeuft als user 0"
|
||||
|
||||
scrutiny:
|
||||
description: Laufwerks- / SMART-Monitoring
|
||||
tier: 3
|
||||
category: ops
|
||||
container_name: scrutiny
|
||||
dependencies:
|
||||
- traefik
|
||||
url: https://scrutiny.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/scrutiny/config
|
||||
- /mnt/user/appdata/scrutiny/influxdb
|
||||
first_check: "Device-Mounts vorhanden? privileged=true gesetzt? Traefik erreichbar?"
|
||||
notes: "privileged: true dokumentierte Ausnahme"
|
||||
|
||||
glances:
|
||||
description: System- / Container-Monitoring
|
||||
tier: 3
|
||||
category: ops
|
||||
container_name: glances
|
||||
dependencies:
|
||||
- traefik
|
||||
url: https://glances.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths: []
|
||||
first_check: "Docker-Socket lesbar? rootfs gemountet? Traefik erreichbar?"
|
||||
notes: "rebuildbar; Docker-Socket und rootfs Mounts"
|
||||
|
||||
borg-ui:
|
||||
description: Borg Backup- / Restore UI
|
||||
tier: 3
|
||||
category: ops
|
||||
container_name: borg-ui
|
||||
dependencies:
|
||||
- traefik
|
||||
url: https://borg.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/borg-ui/data
|
||||
- /mnt/user/backups/borg/dumps
|
||||
first_check: "Borg-Repo-Credentials vorhanden? Backup-Mounts erreichbar? Traefik healthy?"
|
||||
notes: "breite Mounts bewusst dokumentiert; /local/secrets im DR-Scope"
|
||||
|
||||
backrest:
|
||||
description: Backup-Admin-Dienst (Legacy-Backup-Ebene)
|
||||
tier: 3
|
||||
category: ops
|
||||
container_name: backrest
|
||||
dependencies:
|
||||
- traefik
|
||||
url: https://backrest.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/backrest
|
||||
first_check: "Repo/SSH-Mounts erreichbar? Traefik healthy?"
|
||||
notes: "breite Mounts bewusst dokumentiert"
|
||||
|
||||
hermes-gateway:
|
||||
description: Hermes Agent Gateway / AI Ops Assistant
|
||||
tier: 3
|
||||
category: ops
|
||||
container_name: hermes-gateway
|
||||
dependencies: []
|
||||
url: null
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/hermes-agent/data
|
||||
first_check: "hermes_net:8642/health erreichbar? SSH-Key gemountet? LLM-Provider erreichbar?"
|
||||
notes: "kein Docker-Socket; SSH terminal backend; echte .env auf Host-Appdata"
|
||||
|
||||
ddns-updater:
|
||||
description: Cloudflare / DDNS Aktualisierung
|
||||
tier: 3
|
||||
category: infra
|
||||
container_name: ddns-updater
|
||||
dependencies: []
|
||||
url: null
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/ddns-updater
|
||||
first_check: "Internetzugang? Cloudflare API erreichbar? Config vorhanden?"
|
||||
notes: "bewusst in frontend_net weil backend_net internal ist"
|
||||
|
||||
code-server:
|
||||
description: Web-Editor / Operations Workspace
|
||||
tier: 3
|
||||
category: ops
|
||||
container_name: code-server
|
||||
dependencies:
|
||||
- traefik
|
||||
url: https://code.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/code-server
|
||||
- /mnt/user/services/dev
|
||||
first_check: "Traefik erreichbar? PASSWORD_FILE lesbar?"
|
||||
notes: "PASSWORD_FILE; Workspaces bei Restore beachten"
|
||||
|
||||
filebrowser:
|
||||
description: Datei-Browser fuer Appdata
|
||||
tier: 3
|
||||
category: ops
|
||||
container_name: filebrowser
|
||||
dependencies:
|
||||
- traefik
|
||||
url: https://files.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/filebrowser
|
||||
first_check: "Appdata-Mounts erreichbar? Traefik healthy?"
|
||||
notes: "breiter /mnt/user/appdata Mount; Einschraenkung langfristig als TODO"
|
||||
|
||||
speedtest-tracker:
|
||||
description: Speedtest-Monitoring
|
||||
tier: 3
|
||||
category: ops
|
||||
container_name: speedtest-tracker
|
||||
dependencies:
|
||||
- traefik
|
||||
url: https://speedtest.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths:
|
||||
- /mnt/user/appdata/speedtest-tracker/config
|
||||
first_check: "APP_KEY gesetzt? Internetzugang fuer Speedtest vorhanden?"
|
||||
notes: "APP_KEY, ADMIN_PASSWORD als Stack ENV"
|
||||
|
||||
bentopdf:
|
||||
description: PDF-Tooling
|
||||
tier: 3
|
||||
category: app
|
||||
container_name: bentopdf
|
||||
dependencies:
|
||||
- traefik
|
||||
url: https://pdf.kaleschke.info
|
||||
dump_file: null
|
||||
data_paths: []
|
||||
first_check: "COOP/COEP Middleware gesetzt? Traefik healthy?"
|
||||
notes: "rebuildbar; keine kritische Persistenz; Live-Status pruefen"
|
||||
@@ -0,0 +1,153 @@
|
||||
# Skill: homelab-ops-monitor
|
||||
|
||||
## Zweck
|
||||
|
||||
Dieser Skill macht Hermes zum kontextuellen Ops-Assistenten fuer das Kallilabcore-Homelab.
|
||||
Wenn ein Container unhealthy wird, liefert dieser Skill keine rohe Fehlermeldung,
|
||||
sondern einen angereicherten Alert: Was ist kaputt, welche Abhaengigkeiten sind betroffen,
|
||||
wie alt ist der letzte Backup-Dump, und was ist der erste konkrete Diagnoseschritt.
|
||||
|
||||
---
|
||||
|
||||
## Wann aktivieren
|
||||
|
||||
- Wenn ein Container unhealthy gemeldet wird (manuell oder via Cronjob)
|
||||
- Wenn der Benutzer fragt: "Was ist kaputt?" / "Was ist mit [Service]?"
|
||||
- Wenn ein proaktiver Health-Check ausgefuehrt werden soll
|
||||
- Wenn ein ntfy-Alert angereichert werden soll, bevor er gesendet wird
|
||||
|
||||
---
|
||||
|
||||
## Kernprinzipien
|
||||
|
||||
1. **Immer check_health.py ausfuehren** — nie raten, immer messen.
|
||||
2. **Kontext aus services.yaml** — Abhaengigkeiten und Dump-Info sind dort definiert.
|
||||
3. **ntfy-Alert nur, wenn wirklich etwas unhealthy ist** — kein Alert-Spam.
|
||||
4. **Tier 1 = urgent, Tier 2 = high, Tier 3 = default** — ntfy Priority entsprechend setzen.
|
||||
5. **Kein Schreiben, kein Neustart** — dieser Skill diagnostiziert, handelt nicht.
|
||||
|
||||
---
|
||||
|
||||
## Ausfuehrungsschritte
|
||||
|
||||
### Schritt 1 — Health-Check ausfuehren
|
||||
|
||||
Fuehre via Terminal (SSH) auf dem Host aus:
|
||||
|
||||
```bash
|
||||
python3 /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py
|
||||
```
|
||||
|
||||
Fuer einen gezielten Service:
|
||||
```bash
|
||||
python3 /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py <service-key>
|
||||
```
|
||||
|
||||
Fuer den Gesamtstatus (Tier 1+2):
|
||||
```bash
|
||||
python3 /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py --summary
|
||||
```
|
||||
|
||||
### Schritt 2 — JSON-Output interpretieren
|
||||
|
||||
Der Report enthaelt je Service:
|
||||
- `tier` — Kritikalitaet (1=Control Plane, 2=App, 3=Ops)
|
||||
- `container.healthy` — aktueller Gesundheitsstatus
|
||||
- `unhealthy_deps` — Liste der ebenfalls unhealthy Abhaengigkeiten
|
||||
- `dump.age_hours` — Alter des letzten Dumps in Stunden (>26h = Warnung)
|
||||
- `dump.warn` — true wenn Dump veraltet
|
||||
- `first_check` — erster Diagnoseschritt laut service catalog
|
||||
- `notes` — betriebliche Hinweise
|
||||
|
||||
### Schritt 3 — ntfy-Alert bauen
|
||||
|
||||
Baue eine ntfy-Nachricht nach diesem Format:
|
||||
|
||||
```
|
||||
[Titel]
|
||||
[Tier-Emoji] [service-key] unhealthy (Tier [N])
|
||||
|
||||
Beschreibung: [description]
|
||||
|
||||
Abhaengigkeiten:
|
||||
[✅/❌] [dep-key] — [status]
|
||||
|
||||
Letzter Dump: [age_hours]h alt [✅/⚠️] (oder: kein Dump konfiguriert)
|
||||
|
||||
Erster Check:
|
||||
[first_check]
|
||||
|
||||
Hinweis: [notes]
|
||||
```
|
||||
|
||||
Tier-Emojis: Tier 1 = 🔴, Tier 2 = 🟠, Tier 3 = 🟡
|
||||
Dump-Warnschwelle: >26 Stunden = ⚠️
|
||||
|
||||
### Schritt 4 — ntfy senden
|
||||
|
||||
```bash
|
||||
curl -s \
|
||||
-H "Title: [Tier N] [service-key] unhealthy" \
|
||||
-H "Priority: [urgent|high|default]" \
|
||||
  -H "Tags: warning,[tier1|tier2|tier3]" \
|
||||
-d "[message]" \
|
||||
https://ntfy.kaleschke.info/homelab-alerts
|
||||
```
|
||||
|
||||
ntfy Prioritaeten:
|
||||
- Tier 1 → `urgent`
|
||||
- Tier 2 → `high`
|
||||
- Tier 3 → `default`
|
||||
|
||||
---
|
||||
|
||||
## Sonderfaelle
|
||||
|
||||
### Unbekannter Container (nicht in services.yaml)
|
||||
-> Alert senden mit Hinweis "nicht in services.yaml — bitte aktualisieren"
|
||||
-> services.yaml Pfad: `/mnt/user/services/homelab/ops/hermes-agent/services.yaml`
|
||||
|
||||
### ntfy selbst ist unhealthy
|
||||
-> Alert kann nicht per ntfy gesendet werden
|
||||
-> Hermes sendet stattdessen via Telegram (falls konfiguriert)
|
||||
-> Nachricht: "KRITISCH: ntfy ist unhealthy — kein Push-Alerting aktiv"
|
||||
|
||||
### Alle Tier-1-Abhaengigkeiten unhealthy
|
||||
-> Wahrscheinlich kein isoliertes Problem — Host oder Netzwerk pruefen
|
||||
-> Zusammenfassenden Alert senden statt Einzel-Alerts
|
||||
|
||||
### check_health.py nicht gefunden
|
||||
-> Meldung: "Script nicht gefunden unter /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py"
|
||||
-> Pruefe, ob Komodo den Stack zuletzt deployed hat
|
||||
|
||||
---
|
||||
|
||||
## Cronjob-Empfehlung
|
||||
|
||||
Fuer automatische Checks ohne Uptime-Kuma-Webhook:
|
||||
|
||||
```
|
||||
# Jede Stunde — prueft alle unhealthy Container
|
||||
0 * * * * python3 /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py
|
||||
|
||||
# Taeglich 07:00 — Gesamtstatus Tier 1+2
|
||||
0 7 * * * python3 /mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py --summary
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Nicht-Ziele dieses Skills
|
||||
|
||||
- **Kein automatischer Neustart** von Containern
|
||||
- **Kein Schreiben** in Compose-Dateien oder Konfigurationen
|
||||
- **Kein Deploy** via Komodo
|
||||
- **Keine Diagnose-Tiefe** jenseits des `first_check`-Hinweises (das ist Aufgabe des Benutzers)
|
||||
|
||||
---
|
||||
|
||||
## Verwandte Skills und Ressourcen
|
||||
|
||||
- `kallilab-homelab-ops` — Governance-Skill fuer Aenderungsentscheidungen
|
||||
- `services.yaml` — Wissensbasis: `/mnt/user/services/homelab/ops/hermes-agent/services.yaml`
|
||||
- `check_health.py` — Ausfuehrungs-Script: `/mnt/user/services/homelab/ops/hermes-agent/scripts/check_health.py`
|
||||
- Repo: `https://git.kaleschke.info` (origin/master)
|
||||
Reference in New Issue
Block a user