# Monitoring stack: Prometheus + Alertmanager (with an ntfy bridge), blackbox /
# node / cAdvisor exporters, Loki + Promtail for logs, Grafana (published via
# Traefik labels on frontend_net) and InfluxDB 3 Core.
# Images are pinned by tag AND digest; every service sets no-new-privileges.
services:
  prometheus:
    image: prom/prometheus:v3.12.0@sha256:69f5241418838263316593f7274a304b095c40bcf22e57272865da91bd60a8ac
    container_name: monitoring-prometheus
    restart: unless-stopped
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --storage.tsdb.path=/prometheus
      - --storage.tsdb.retention.time=30d
      - --web.enable-lifecycle
    volumes:
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
      - prometheus_data:/prometheus
    networks:
      - monitoring_net
    # Reachable only from other containers on monitoring_net (no host port).
    expose:
      - "9090"
    security_opt:
      - no-new-privileges:true
    depends_on:
      - alertmanager
      - blackbox-exporter
      - node-exporter
      - cadvisor

  alertmanager:
    image: prom/alertmanager:v0.32.1@sha256:51a825c2a40acc3e338fdd00d622e01ec090f72be2b3ea46be0839cd47a4d286
    container_name: monitoring-alertmanager
    restart: unless-stopped
    command:
      - --config.file=/etc/alertmanager/alertmanager.yml
      - --storage.path=/alertmanager
    volumes:
      - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager_data:/alertmanager
    networks:
      - monitoring_net
    expose:
      - "9093"
    security_opt:
      - no-new-privileges:true

  # Runs the bind-mounted bridge.py; NTFY_URL is handed to it via environment.
  alertmanager-ntfy-bridge:
    image: python:3.14-alpine@sha256:5a824eb82cc75361f98611f3cfc5091ea33f10a6ccea4d4ebdabbc523b9a1614
    container_name: monitoring-alertmanager-ntfy-bridge
    restart: unless-stopped
    dns:
      - 1.1.1.1
      - 8.8.8.8
    environment:
      NTFY_URL: https://ntfy.kaleschke.info/homelab-alerts
    command:
      - python
      - /app/bridge.py
    volumes:
      - ./alertmanager-ntfy-bridge/bridge.py:/app/bridge.py:ro
    networks:
      - monitoring_net
    expose:
      - "8080"
    security_opt:
      - no-new-privileges:true

  blackbox-exporter:
    image: prom/blackbox-exporter:v0.28.0@sha256:e753ff9f3fc458d02cca5eddab5a77e1c175eee484a8925ac7d524f04366c2fc
    container_name: monitoring-blackbox-exporter
    restart: unless-stopped
    dns:
      - 1.1.1.1
      - 8.8.8.8
    command:
      - --config.file=/etc/blackbox_exporter/blackbox.yml
    volumes:
      - ./blackbox/blackbox.yml:/etc/blackbox_exporter/blackbox.yml:ro
    networks:
      - monitoring_net
    expose:
      - "9115"
    security_opt:
      - no-new-privileges:true

  loki:
    image: grafana/loki:3.7.2@sha256:191d4fdfb7264f16989f0a57f320872620a5a7c2ceeec6229212c4190ec49b86
    container_name: monitoring-loki
    restart: unless-stopped
    command:
      - -config.file=/etc/loki/loki-config.yml
    volumes:
      - ./loki/loki-config.yml:/etc/loki/loki-config.yml:ro
      - loki_data:/loki
    networks:
      - monitoring_net
    expose:
      - "3100"
    security_opt:
      - no-new-privileges:true

  promtail:
    image: grafana/promtail:3.6.11@sha256:a761cb834cfaeee29745440d4884d6748f0a08d8f68928db1d707018c1dcfbe9
    container_name: monitoring-promtail
    restart: unless-stopped
    command:
      - -config.file=/etc/promtail/promtail-config.yml
    volumes:
      - ./promtail/promtail-config.yml:/etc/promtail/promtail-config.yml:ro
      - promtail_positions:/positions
      # Docker socket and container log directory are mounted read-only.
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
    networks:
      - monitoring_net
    security_opt:
      - no-new-privileges:true
    depends_on:
      - loki

  grafana:
    image: grafana/grafana:13.0.1@sha256:0f86bada30d65ef9d0183b90c1e2682ac92d53d95da8bed322b984ea78a4a73a
    container_name: monitoring-grafana
    # NOTE(review): runs as root; presumably needed to read the mounted secrets
    # or write grafana_data - confirm before tightening.
    user: "0"
    restart: unless-stopped
    dns:
      - 1.1.1.1
      - 8.8.8.8
    environment:
      GF_SERVER_ROOT_URL: https://monitoring.kaleschke.info/
      GF_SECURITY_ADMIN_USER: admin
      GF_SECURITY_ADMIN_PASSWORD__FILE: /run/secrets/monitoring_grafana_admin_password
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_AUTH_ANONYMOUS_ENABLED: "false"
      GF_PLUGINS_PREINSTALL_DISABLED: "true"
    # Wrapper entrypoint: export the InfluxDB token from its secret file, then
    # hand over to the image's /run.sh. "$$" is Compose's escape for a literal
    # "$", so the command substitution happens inside the container shell.
    entrypoint:
      - /bin/sh
      - -c
      - |
        export GRAFANA_INFLUXDB_TOKEN="$$(cat /run/secrets/monitoring_grafana_influxdb_token)"
        exec /run.sh
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning:ro
      - ./grafana/dashboards:/var/lib/grafana/dashboards:ro
    networks:
      - monitoring_net
      - frontend_net
    secrets:
      - monitoring_grafana_admin_password
      - monitoring_grafana_influxdb_token
    expose:
      - "3000"
    security_opt:
      - no-new-privileges:true
    depends_on:
      - prometheus
      - loki
      - influxdb3-core
    labels:
      - traefik.enable=true
      - traefik.docker.network=frontend_net
      - traefik.http.routers.monitoring-grafana.rule=Host(`monitoring.kaleschke.info`)
      - traefik.http.routers.monitoring-grafana.entrypoints=websecure
      - traefik.http.routers.monitoring-grafana.tls=true
      - traefik.http.routers.monitoring-grafana.tls.certresolver=le
      - traefik.http.routers.monitoring-grafana.middlewares=authelia@file,secure-headers@file
      - traefik.http.services.monitoring-grafana.loadbalancer.server.port=3000

  # One-shot job (profile "bootstrap", restart "no"): waits for Grafana's
  # /api/health, downloads the listed dashboards from grafana.com and imports
  # them through Grafana's HTTP API using the admin password secret.
  grafana-dashboard-importer:
    # Pinned to the same digest the alertmanager-ntfy-bridge service uses for
    # this tag, so every image in the stack is digest-pinned.
    image: python:3.14-alpine@sha256:5a824eb82cc75361f98611f3cfc5091ea33f10a6ccea4d4ebdabbc523b9a1614
    container_name: monitoring-grafana-dashboard-importer
    restart: "no"
    profiles:
      - bootstrap
    dns:
      - 1.1.1.1
      - 8.8.8.8
    networks:
      - monitoring_net
      - frontend_net
    security_opt:
      - no-new-privileges:true
    depends_on:
      - grafana
    secrets:
      - monitoring_grafana_admin_password
    command:
      - /bin/sh
      - -c
      - |
        # Abort on the first failing command so a failed import is not
        # followed by the success message and a zero exit status.
        set -e
        python - <<'PY'
        import base64
        import json
        import time
        import urllib.error
        import urllib.request

        grafana_url = "http://grafana:3000"
        with open("/run/secrets/monitoring_grafana_admin_password", encoding="utf-8") as secret:
            password = secret.read().strip()

        auth = base64.b64encode(f"admin:{password}".encode()).decode()
        headers = {
            "Authorization": f"Basic {auth}",
            "Content-Type": "application/json",
        }

        def request(path, payload=None, timeout=20):
            data = None if payload is None else json.dumps(payload).encode()
            req = urllib.request.Request(f"{grafana_url}{path}", data=data, headers=headers)
            if payload is not None:
                req.method = "POST"
            with urllib.request.urlopen(req, timeout=timeout) as response:
                body = response.read()
                return json.loads(body.decode() or "{}") if body else {}

        def import_dashboard(payload, dashboard_id):
            for attempt in range(1, 7):
                try:
                    return request("/api/dashboards/import", payload)
                except urllib.error.HTTPError as exc:
                    body = exc.read().decode(errors="replace")[:300]
                    if attempt == 6:
                        raise RuntimeError(f"Dashboard {dashboard_id} import failed: {exc.code} {body}") from exc
                    print(f"Dashboard {dashboard_id} import attempt {attempt} failed: HTTP {exc.code} {body}")
                    time.sleep(5)
                except Exception as exc:
                    if attempt == 6:
                        raise
                    print(f"Dashboard {dashboard_id} import attempt {attempt} failed: {exc}")
                    time.sleep(5)

        for _ in range(60):
            try:
                request("/api/health", timeout=5)
                break
            except Exception:
                time.sleep(2)
        else:
            raise SystemExit("Grafana did not become ready in time")

        dashboards = [
            (17346, "Prometheus"),
        ]

        def fetch_dashboard(dashboard_id):
            url = f"https://grafana.com/api/dashboards/{dashboard_id}/revisions/latest/download"
            for attempt in range(1, 7):
                try:
                    with urllib.request.urlopen(url, timeout=30) as response:
                        return json.loads(response.read().decode())
                except Exception as exc:
                    if attempt == 6:
                        raise
                    print(f"Dashboard {dashboard_id} download attempt {attempt} failed: {exc}")
                    time.sleep(5)

        for dashboard_id, default_datasource in dashboards:
            dashboard = fetch_dashboard(dashboard_id)
            inputs = []
            for item in dashboard.get("__inputs", []):
                plugin_id = item.get("pluginId", "").lower()
                value = "Loki" if plugin_id == "loki" or default_datasource == "Loki" else "Prometheus"
                inputs.append({
                    "name": item.get("name"),
                    "type": item.get("type", "datasource"),
                    "pluginId": item.get("pluginId", "prometheus"),
                    "value": value,
                })
            import_dashboard({
                "dashboard": dashboard,
                "overwrite": True,
                "inputs": inputs,
            }, dashboard_id)
            print(f"Imported Grafana dashboard {dashboard_id}")
        PY
        echo "Dashboard import complete."

  node-exporter:
    image: prom/node-exporter:v1.11.1@sha256:e9cff4fc67b1818f8c97adb115b9f12c9a54b533de86765d4a0effc01b357205
    container_name: monitoring-node-exporter
    restart: unless-stopped
    command:
      - --path.procfs=/host/proc
      - --path.sysfs=/host/sys
      - --path.rootfs=/rootfs
      - --collector.textfile.directory=/textfile
      # "$$" is Compose's escape for a literal "$"; the container receives the
      # regex with a single "$" end-of-string anchor.
      - --collector.filesystem.mount-points-exclude=^/(dev|proc|sys|run|var/lib/docker/.+|var/lib/containers/storage/.+)($$|/)
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
      - /mnt/user/services/posture-check/textfile:/textfile:ro
    networks:
      - monitoring_net
    expose:
      - "9100"
    security_opt:
      - no-new-privileges:true

  cadvisor:
    image: ghcr.io/google/cadvisor:v0.57.0@sha256:e75bdb03b74b0b6995f208f166fead2e6e555dde73e44200113bb26f41b1981d
    container_name: monitoring-cadvisor
    restart: unless-stopped
    command:
      - --docker_only=true
      - --housekeeping_interval=30s
      - --store_container_labels=false
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      - /dev/disk:/dev/disk:ro
    networks:
      - monitoring_net
    expose:
      - "8080"
    security_opt:
      - no-new-privileges:true

  influxdb3-core:
    image: influxdb:3.9.2-core@sha256:31ad94df2248134989b2cf73d965e51dd5f35dfae22d7ed8f4776b12e6f69f4e
    container_name: monitoring-influxdb3-core
    # NOTE(review): runs as root; presumably required by ownership of the
    # bind-mounted host directories - confirm.
    user: "0"
    restart: unless-stopped
    # The only host-published port in this file; binds to loopback unless
    # INFLUXDB_BIND_IP is set.
    ports:
      - "${INFLUXDB_BIND_IP:-127.0.0.1}:8181:8181"
    command:
      - influxdb3
      - serve
      - --node-id=kallilabcore
      - --object-store=file
      - --data-dir=/var/lib/influxdb3/data
      - --plugin-dir=/var/lib/influxdb3/plugins
      - --admin-token-file=/run/secrets/influxdb3_admin_token
    volumes:
      - /mnt/user/appdata/influxdb3/data:/var/lib/influxdb3/data
      - /mnt/user/appdata/influxdb3/plugins:/var/lib/influxdb3/plugins
    secrets:
      - influxdb3_admin_token
    networks:
      - monitoring_net
      - monitoring_influx_lan
    security_opt:
      - no-new-privileges:true

networks:
  monitoring_net:
    name: monitoring_net
    driver: bridge
  monitoring_influx_lan:
    driver: bridge
  # Created outside this file; referenced by the Traefik labels on grafana.
  frontend_net:
    external: true

volumes:
  prometheus_data:
  alertmanager_data:
  loki_data:
  promtail_positions:
  grafana_data:

# File-backed secrets, mounted into containers under /run/secrets/<name>.
secrets:
  monitoring_grafana_admin_password:
    file: /mnt/user/appdata/secrets/monitoring_grafana_admin_password.txt
  monitoring_grafana_influxdb_token:
    file: /mnt/user/appdata/secrets/monitoring_grafana_influxdb_token.txt
  influxdb3_admin_token:
    file: /mnt/user/appdata/secrets/influxdb3_admin_token.json