From ce747f687f98031f3e48921531a9354e9fd577ea Mon Sep 17 00:00:00 2001
From: Micha <michideheld@gmx.de>
Date: Wed, 10 Jun 2026 10:06:52 +0200
Subject: [PATCH] ops-report: cert-dedup, blackbox-DNS auf AdGuard, neue
 Noise-Patterns

Behebt drei Befunde aus dem Operations-Report 2026-06-10:

- daily-status-report.sh: Zertifikate werden vor der Auswertung pro
  Domain-Set dedupliziert; nur das laengstlaufende Cert zaehlt. Traefik
  haelt waehrend der Erneuerung altes + neues Cert in acme.json, was
  bisher eine falsche KRITISCH-Warnung (traefik.kaleschke.info 5 Tage)
  ausloeste, obwohl das neue Cert 65 Tage Restlaufzeit hat.

- monitoring/blackbox-exporter: DNS von 1.1.1.1/8.8.8.8 auf AdGuard
  (172.23.0.3 via dns_net) umgestellt. Externe Resolver lieferten die
  WAN-IP, was Hairpin-NAT-Timeouts (9,5s) bei Probes von cloud/glances
  verursachte (662 Fehler/Tag).

- log-noise.patterns: Fritz!Box-SOA-Fehler (AdGuard, QDCOUNT > 1, vgl.
  RFC 9619) und fehlendes grafana-amazonprometheus-datasource-Plugin als
  bekanntes Rauschen klassifiziert (~1800 Zeilen/Tag).

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 monitoring/docker-compose.yml                 |  9 +++++++--
 services/posture-check/daily-status-report.sh | 10 +++++++++-
 services/posture-check/log-noise.patterns     | 17 ++++++++++++++++-
 3 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml
index 7042139..0cf093b 100644
--- a/monitoring/docker-compose.yml
+++ b/monitoring/docker-compose.yml
@@ -66,15 +66,18 @@ services:
     image: prom/blackbox-exporter:v0.28.0@sha256:e753ff9f3fc458d02cca5eddab5a77e1c175eee484a8925ac7d524f04366c2fc
     container_name: monitoring-blackbox-exporter
     restart: unless-stopped
+    # Use AdGuard (172.23.0.3 on dns_net) so *.kaleschke.info resolves to the
+    # internal Traefik IP. External resolvers (1.1.1.1/8.8.8.8) return the public
+    # WAN IP, which causes hairpin-NAT timeouts when probing from inside Docker.
     dns:
-      - 1.1.1.1
-      - 8.8.8.8
+      - 172.23.0.3
     command:
       - --config.file=/etc/blackbox_exporter/blackbox.yml
     volumes:
       - ./blackbox/blackbox.yml:/etc/blackbox_exporter/blackbox.yml:ro
     networks:
       - monitoring_net
+      - dns_net
     expose:
       - "9115"
     security_opt:
@@ -367,6 +370,8 @@ networks:
     driver: bridge
   frontend_net:
     external: true
+  dns_net:
+    external: true
 
 volumes:
   prometheus_data:
diff --git a/services/posture-check/daily-status-report.sh b/services/posture-check/daily-status-report.sh
index e1b24bb..d3e100f 100755
--- a/services/posture-check/daily-status-report.sh
+++ b/services/posture-check/daily-status-report.sh
@@ -459,6 +459,10 @@ with open("/acme.json", "r", encoding="utf-8") as handle:
     data = json.load(handle)
 
 now = datetime.now(timezone.utc)
+# Deduplicate: for each unique set of domains keep only the longest-lived cert.
+# Traefik stores both the old and the newly-issued cert in acme.json during
+# the renewal window, which would otherwise produce a false warning.
+best = {}  # frozenset(domains) -> (days, expire_date_iso, names)
 for resolver in data.values():
     for cert in resolver.get("Certificates", []):
         domain = cert.get("domain", {}).get("main") or "-"
@@ -474,7 +478,11 @@ for resolver in data.values():
         not_after = datetime.strptime(decoded["notAfter"], "%b %d %H:%M:%S %Y %Z").replace(tzinfo=timezone.utc)
         days = (not_after - now).days
         names = ", ".join([domain, *sans])
-        print(f"{days}\t{not_after.date().isoformat()}\t{names}")
+        key = frozenset([domain, *sans])
+        if key not in best or days > best[key][0]:
+            best[key] = (days, not_after.date().isoformat(), names)
+for days, expires, names in best.values():
+    print(f"{days}\t{expires}\t{names}")
 PY
   then
     if [ ! -s "$cert_file" ]; then
diff --git a/services/posture-check/log-noise.patterns b/services/posture-check/log-noise.patterns
index 2e580e6..8d12d88 100644
--- a/services/posture-check/log-noise.patterns
+++ b/services/posture-check/log-noise.patterns
@@ -18,7 +18,7 @@
 # Removing a pattern: replace with a fresh attention example in the next
 # daily report and consult before reintroducing.
 #
-# Last reviewed: 2026-05-21
+# Last reviewed: 2026-06-10
 
 # Loki internal query cancellations / scheduler chatter.
 # Why: Loki cancels internal queries continuously when downstream Promtails
@@ -72,3 +72,18 @@ authelia.*Request timeout occurred.*status_code=408
 #       noise becomes overwhelming, add a *narrow* pattern restricted to
 #       push contexts only (e.g. `vaultwarden.*push.*(ResolveError|...)`).
 vaultwarden.*(Token has expired|Invalid refresh token|Failed to decode.*refresh_token|POST /identity/connect/token => 401 Unauthorized)
+
+# AdGuard: Fritz!Box sends malformed SOA queries for myfritz.net / myfritz.link.
+# Why: AVM Fritz!Box devices send multi-question DNS SOA queries (QDCOUNT > 1).
+#      RFC 1035 does not strictly forbid this, but RFC 9619 (updating it) does;
+#      AdGuard rejects them with an error, with no operational impact.
+# Re-check: if the same error appears for non-AVM domains, or if rate spikes
+#           well above 1000/day without a Fritz!Box reboot explaining it.
+adguard.*bad question section.*only 1 question allowed
+
+# Grafana: usage-stats collector looks for the Amazon Prometheus plugin, which
+# is not installed in this setup. The error is emitted once per stats cycle.
+# Why: GF_PLUGINS_PREINSTALL_DISABLED=true keeps the plugin list minimal;
+#      this lookup is harmless and does not affect any dashboard.
+# Re-check: only if Amazon Prometheus is added as a datasource.
+monitoring-grafana.*grafana-amazonprometheus-datasource not found