diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml index 7042139..0cf093b 100644 --- a/monitoring/docker-compose.yml +++ b/monitoring/docker-compose.yml @@ -66,15 +66,18 @@ services: image: prom/blackbox-exporter:v0.28.0@sha256:e753ff9f3fc458d02cca5eddab5a77e1c175eee484a8925ac7d524f04366c2fc container_name: monitoring-blackbox-exporter restart: unless-stopped + # Use AdGuard so *.kaleschke.info resolves to the internal Traefik IP. + # External resolvers (1.1.1.1/8.8.8.8) return the public WAN IP, which + # causes hairpin-NAT timeouts when probing from inside the Docker network. dns: - - 1.1.1.1 - - 8.8.8.8 + - 172.23.0.3 command: - --config.file=/etc/blackbox_exporter/blackbox.yml volumes: - ./blackbox/blackbox.yml:/etc/blackbox_exporter/blackbox.yml:ro networks: - monitoring_net + - dns_net expose: - "9115" security_opt: @@ -367,6 +370,8 @@ networks: driver: bridge frontend_net: external: true + dns_net: + external: true volumes: prometheus_data: diff --git a/services/posture-check/daily-status-report.sh b/services/posture-check/daily-status-report.sh index e1b24bb..d3e100f 100755 --- a/services/posture-check/daily-status-report.sh +++ b/services/posture-check/daily-status-report.sh @@ -459,6 +459,10 @@ with open("/acme.json", "r", encoding="utf-8") as handle: data = json.load(handle) now = datetime.now(timezone.utc) +# Deduplicate: for each unique set of domains keep only the longest-lived cert. +# Traefik stores both the old and the newly-issued cert in acme.json during +# the renewal window, which would otherwise produce a false warning. 
+best = {} # frozenset(domains) -> (days, expire_date_iso, names) for resolver in data.values(): for cert in resolver.get("Certificates", []): domain = cert.get("domain", {}).get("main") or "-" @@ -474,7 +478,11 @@ for resolver in data.values(): not_after = datetime.strptime(decoded["notAfter"], "%b %d %H:%M:%S %Y %Z").replace(tzinfo=timezone.utc) days = (not_after - now).days names = ", ".join([domain, *sans]) - print(f"{days}\t{not_after.date().isoformat()}\t{names}") + key = frozenset([domain, *sans]) + if key not in best or days > best[key][0]: + best[key] = (days, not_after.date().isoformat(), names) +for days, expires, names in best.values(): + print(f"{days}\t{expires}\t{names}") PY then if [ ! -s "$cert_file" ]; then diff --git a/services/posture-check/log-noise.patterns b/services/posture-check/log-noise.patterns index 2e580e6..8d12d88 100644 --- a/services/posture-check/log-noise.patterns +++ b/services/posture-check/log-noise.patterns @@ -18,7 +18,7 @@ # Removing a pattern: replace with a fresh attention example in the next # daily report and consult before reintroducing. # -# Last reviewed: 2026-05-21 +# Last reviewed: 2026-06-10 # Loki internal query cancellations / scheduler chatter. # Why: Loki cancels internal queries continuously when downstream Promtails @@ -72,3 +72,18 @@ authelia.*Request timeout occurred.*status_code=408 # noise becomes overwhelming, add a *narrow* pattern restricted to # push contexts only (e.g. `vaultwarden.*push.*(ResolveError|...)`). vaultwarden.*(Token has expired|Invalid refresh token|Failed to decode.*refresh_token|POST /identity/connect/token => 401 Unauthorized) + +# AdGuard: Fritz!Box sends malformed SOA queries for myfritz.net / myfritz.link. +# Why: AVM Fritz!Box devices send multi-question DNS SOA queries (QDCOUNT > 1), +# which RFC 9619 (updating RFC 1035) forbids. AdGuard rejects them with the error +# "only 1 question allowed"; they have no operational impact.
+# Re-check: if the same error appears for non-AVM domains, or if the rate spikes +# well above 1000/day without a Fritz!Box reboot to explain it. +adguard.*bad question section.*only 1 question allowed + +# Grafana: usage-stats collector looks for the Amazon Prometheus plugin, which +# is not installed in this setup. The error is emitted once per stats cycle. +# Why: GF_PLUGINS_PREINSTALL_DISABLED=true keeps the plugin list minimal; +# this lookup is harmless and does not affect any dashboard. +# Re-check: only if Amazon Prometheus is added as a datasource. +monitoring-grafana.*grafana-amazonprometheus-datasource not found