monitoring + backup: Stale-Handle-Hardening und Dead-Man's-Switch

Schliesst den lokalen Code-Stand fuer zwei offene MASTER_TODO-Punkte ab.

monitoring: restliche Einzeldatei-Bind-Mounts (alertmanager, blackbox,
loki, promtail, alertmanager-ntfy-bridge) auf Directory-Mounts umgestellt,
analog zum Prometheus-Fix vom 2026-06-19. Vermeidet "Stale NFS file handle"
auf dem /mnt/user-FUSE-Share bei git/Komodo-Updates. grafana-provisioning
war bereits Directory-Mount. `docker compose config` gruen. Beim Deploy
--force-recreate noetig, da sich Mount-Zielpfade aendern.

backup: endpoint-agnostischer Dead-Man's-Switch (Healthchecks-kompatibel,
Cloud oder self-hosted) in pull-critical-backups.ps1 und pre-borg.sh.
Pings /start, Erfolg und /fail; No-Op ohne konfigurierte URL, bricht also
keinen Lauf. Ping-URLs sind Capability-URLs und bleiben als Secret
ausserhalb des Repos.

Doku: SECRETS_MAP, Nearline-README und MASTER_TODO nachgezogen.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2026-06-21 17:54:53 +02:00
parent 7ff6a24c9d
commit f296338530
6 changed files with 162 additions and 51 deletions
+72 -38
View File
@@ -1,11 +1,35 @@
param(
[string]$SourceRoot = "\\192.168.178.58\backups",
[string]$DestinationRoot = "H:\kallilab-nearline-backups",
[string]$HealthchecksUrl = $env:HEALTHCHECKS_NEARLINE_URL,
[switch]$WhatIf
)
$ErrorActionPreference = "Stop"
# Externer Dead-Man's-Switch (endpoint-agnostisch: Healthchecks.io-Cloud oder
# self-hosted). Bleibt der Erfolgs-Ping aus, alarmiert der externe Dienst von
# aussen - genau den Fall, den Prometheus auf Unraid fuer den baerchen-Pull
# nicht sieht. Die Ping-URL ist eine Capability-URL -> wie ein Secret behandeln,
# niemals ins Repo. Quelle: -HealthchecksUrl -> $env:HEALTHCHECKS_NEARLINE_URL
# -> Datei im Userprofil. Ist keine URL gesetzt, ist der Switch ein No-Op.
if (-not $HealthchecksUrl) {
$hcUrlFile = Join-Path $HOME ".kallilab\healthchecks-nearline-url.txt"
if (Test-Path -LiteralPath $hcUrlFile) {
$HealthchecksUrl = (Get-Content -LiteralPath $hcUrlFile -Raw).Trim()
}
}
function Send-HealthcheckPing {
param([string]$Suffix = "")
if (-not $HealthchecksUrl) { return }
try {
Invoke-RestMethod -Uri ("{0}{1}" -f $HealthchecksUrl, $Suffix) -Method Get -TimeoutSec 15 | Out-Null
} catch {
Write-Warning "Healthchecks ping ('$Suffix') failed: $($_.Exception.Message)"
}
}
$Jobs = @(
@{
Name = "borg-dumps-latest"
@@ -145,44 +169,54 @@ if ($WhatIf) {
exit 0
}
$destinationDrive = Split-Path -Qualifier $DestinationRoot
Assert-PathExists -Path $destinationDrive -Label "Destination drive"
# Echter Lauf -> Dead-Man's-Switch aktiv. /start misst die Laufzeit, /fail
# meldet einen abgebrochenen Lauf sofort, der Erfolgs-Ping am Ende bestaetigt.
Send-HealthcheckPing "/start"
try {
$destinationDrive = Split-Path -Qualifier $DestinationRoot
Assert-PathExists -Path $destinationDrive -Label "Destination drive"
$logRoot = Join-Path $DestinationRoot "_logs"
$reportRoot = Join-Path $DestinationRoot "_reports"
New-Item -ItemType Directory -Force -Path $DestinationRoot, $logRoot, $reportRoot | Out-Null
$logRoot = Join-Path $DestinationRoot "_logs"
$reportRoot = Join-Path $DestinationRoot "_reports"
New-Item -ItemType Directory -Force -Path $DestinationRoot, $logRoot, $reportRoot | Out-Null
$results = foreach ($job in $Jobs) {
Invoke-RobocopyJob -Job $job -LogRoot $logRoot
$results = foreach ($job in $Jobs) {
Invoke-RobocopyJob -Job $job -LogRoot $logRoot
}
$reportPath = Join-Path $reportRoot ("nearline-pull-{0}.md" -f (Get-Date -Format "yyyy-MM-dd-HHmmss"))
$lines = @()
$lines += "# H:/ Nearline Pull Report - $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')"
$lines += ""
$lines += "- Source root: ``$SourceRoot``"
$lines += "- Destination root: ``$DestinationRoot``"
$lines += "- Mode: non-destructive copy, no ``/MIR``, no purge"
$lines += ""
$lines += "| Job | Exit code | Source | Destination | Log |"
$lines += "|---|---:|---|---|---|"
foreach ($result in $results) {
$lines += "| $($result.Name) | $($result.ExitCode) | ``$($result.Source)`` | ``$($result.Destination)`` | ``$($result.Log)`` |"
}
$lines += ""
$lines += "Expected critical artifacts after run:"
$lines += ""
$lines += "- ``borg-dumps/latest/immich.dump``"
$lines += "- ``borg-dumps/latest/komodo-mongo.archive.gz``"
$lines += "- ``git-bundles/gitea/latest-report.md``"
$lines += "- ``git-bundles/gitea/micha/*.bundle``"
$lines += ""
$lines += "Bewusst NICHT in Nearline-Scope:"
$lines += ""
$lines += "- ``unraid-flash-config.tar.gz`` (hostseitig 0600 root:root; Restore aus Hetzner-Borg)"
$lines | Set-Content -LiteralPath $reportPath -Encoding UTF8
Write-Host "H:/ nearline pull completed."
Write-Host "Report: $reportPath"
Send-HealthcheckPing
} catch {
Send-HealthcheckPing "/fail"
throw
}
$reportPath = Join-Path $reportRoot ("nearline-pull-{0}.md" -f (Get-Date -Format "yyyy-MM-dd-HHmmss"))
$lines = @()
$lines += "# H:/ Nearline Pull Report - $(Get-Date -Format 'yyyy-MM-dd HH:mm:ss')"
$lines += ""
$lines += "- Source root: ``$SourceRoot``"
$lines += "- Destination root: ``$DestinationRoot``"
$lines += "- Mode: non-destructive copy, no ``/MIR``, no purge"
$lines += ""
$lines += "| Job | Exit code | Source | Destination | Log |"
$lines += "|---|---:|---|---|---|"
foreach ($result in $results) {
$lines += "| $($result.Name) | $($result.ExitCode) | ``$($result.Source)`` | ``$($result.Destination)`` | ``$($result.Log)`` |"
}
$lines += ""
$lines += "Expected critical artifacts after run:"
$lines += ""
$lines += "- ``borg-dumps/latest/immich.dump``"
$lines += "- ``borg-dumps/latest/komodo-mongo.archive.gz``"
$lines += "- ``git-bundles/gitea/latest-report.md``"
$lines += "- ``git-bundles/gitea/micha/*.bundle``"
$lines += ""
$lines += "Bewusst NICHT in Nearline-Scope:"
$lines += ""
$lines += "- ``unraid-flash-config.tar.gz`` (hostseitig 0600 root:root; Restore aus Hetzner-Borg)"
$lines | Set-Content -LiteralPath $reportPath -Encoding UTF8
Write-Host "H:/ nearline pull completed."
Write-Host "Report: $reportPath"